Beispiel #1
0
    def __init__(self):
        '''Create the crawler and dial client and record the local IP, router tag and wait time.'''
        # crawl settings
        self.crawler = JMCrawler()
        # dial client
        self.dial_client = DialClient()
        # local ip
        self._ip = Common.local_ip()
        # router tag
        self._tag = 'ikuai'

        # wait time
        self.w_time = 1
Beispiel #2
0
    def __init__(self):
        '''Initialise every field of a brand-group activity record to its empty default.'''
        # brand-group crawl settings
        self.crawler = JMCrawler()
        self.crawling_time = Common.now() # current crawl time
        self.crawling_time_s = Common.time_s(self.crawling_time)
        self.crawling_begintime = '' # start time of this crawl run
        self.crawling_beginDate = '' # date of this crawl run
        self.crawling_beginHour = '' # hour of this crawl run
        self.crawling_confirm = 1 # whether this activity needs crawling 1: not started yet, needs crawling 2: already started 0: only item positions need updating

        # category
        self.act_platform = 1 # platform the activity is on 1: Jumei PC
        self.channel_id = 0 # channel the activity is in
        self.channel_name = '' # channel the activity is in
        self.channel_url = '' # url of the channel the activity is in
        self.category_id = 0 # id of the category the activity is in
        self.category_name = '' # name of the category the activity is in
        self.act_position = 0 # position of the activity within its category
        self.front_category_id = 0 # id of the front-end category the activity is in
        self.category_type = '0' # type of the activity's category, '0': default ('1' and '2' are not described in the original comment)
        self.sub_nav_name = '' # name of the sub-navigation under the activity's category

        # whether it is shown on the home page
        self.home_acts = {} # home-page activity info
        self.act_home = 0 # whether shown on the home page, 0: no, 1: yes
        self.act_home_position = '' # position on the home page
        self.act_home_dataType = '' # home-page section it belongs to

        # activity info
        self.act_id = '' # activity id
        self.act_url = '' # activity link
        self.act_url_tail = '' # activity link suffix
        self.act_name = '' # activity name
        self.act_desc = '' # activity description
        self.act_logopic_url = '' # activity logo image link
        self.act_enterpic_url = '' # activity display image link
        self.act_times = '' # activity time range
        self.act_discounts = '' # activity discount info
        self.act_start_time = 0.0 # activity start time
        self.act_start_time_s = '' # activity start time as a string
        self.act_start_date = '' # activity start date
        self.act_time_diff = '' # sale timediff, will start_time
        self.server_time = '' # crawler jm server time
        self.act_end_time = 0.0 # activity end time
        self.act_end_time_s = '' # activity end time as a string
        self.act_status = '' # activity status
        self.act_sign = 1 # activity flag 1: regular brand group, 2: combined group, 3: (not described in the original comment)
        self.act_other_ids = '' # for a combined group, the ids of the other groups
        self.act_brand = '' # activity brand info
        self.act_brand_id = '' # activity brand id

        # shop info
        self.act_seller_id = '' # activity seller id
        self.act_seller_name = '' # activity seller name (back-filled)
        self.act_shop_id = '' # activity shop id (back-filled)
        self.act_shop_name = '' # activity shop name (back-filled)

        # activity transaction info
        self.act_soldcount = 0 # number sold in the activity
        self.act_remindnum = 0 # number of people following the activity
        self.act_discount = '' # activity discount
        self.act_coupon = 0 # activity coupon, default 0 means none
        self.act_coupons = [] # list of coupon contents

        # activity items
        self.act_itemids = []
        self.act_itemval_d = {}
        self.act_itemval_list = []

        # raw data
        self.act_pagedata = '' # full content of the data entry the activity came from
        self.act_page = '' # html content of the activity page
        self.act_pages = {} # data requested from within the activity page
Beispiel #3
0
class RetryCrawler():
    '''A class of retry crawl data'''

    def __init__(self):
        '''Create the crawler and dial client and record the local IP, router tag and wait time.'''
        # crawl settings
        self.crawler = JMCrawler()
        # dial client
        self.dial_client = DialClient()
        # local ip
        self._ip = Common.local_ip()
        # router tag
        self._tag = 'ikuai'

        # wait time (multiplied by the attempt number for linear back-off)
        self.w_time = 1

    # To dial router
    def dialRouter(self, _type, _obj):
        '''Send a "<_type>_<_obj>" dial request to the dial client.

        Any exception is logged and swallowed, so a failed dial never
        aborts the caller.
        '''
        try:
            _module = '%s_%s' % (_type, _obj)
            self.dial_client.send((_module, self._ip, self._tag))
        except Exception as e:
            Common.log('# To dial router exception: %s' % e)

    @staticmethod
    def _is_dns_failure(message):
        '''Return True when *message* reports a name-resolution failure.'''
        return ('Name or service not known' in message
                or 'Temporary failure in name resolution' in message)

    def _redial_and_wait(self):
        '''Ask the router to redial, then sleep 10-30 seconds.'''
        try:
            self.dialRouter(4, 'chn')
        except Exception as e:
            Common.log('# DailClient Exception err: %s' % e)
        time.sleep(random.uniform(10, 30))

    def getData(self, url, refers='', max_retry=20):
        '''Fetch *url* through the crawler, retrying on recoverable errors.

        url       -- address to fetch.
        refers    -- referer value passed through to the crawler.
        max_retry -- maximum number of attempts.

        Returns the page returned by the crawler, or '' when every attempt
        failed or an unrecognised exception occurred.

        Invalid-page and system-busy errors back off linearly, denied pages
        and name-resolution failures trigger a router redial, and read
        timeouts wait 1-3 seconds.  Any other exception stops the loop
        immediately.
        '''
        page = ''
        retry = 1
        while True:
            try:
                page = self.crawler.getData(url, refers)
                break
            except Common.InvalidPageException as e:
                if retry >= max_retry:
                    break
                retry += 1
                Common.log('# Invalid page exception: %s' % e)
                time.sleep(self.w_time * retry)
            except Common.DenypageException as e:
                if retry >= max_retry:
                    break
                retry += 1
                Common.log('# Deny page exception: %s' % e)
                # redial to get a fresh connection
                self._redial_and_wait()
            except Common.SystemBusyException as e:
                if retry >= max_retry:
                    break
                retry += 1
                Common.log('# System busy exception: %s' % e)
                time.sleep(self.w_time * retry)
            except Exception as e:
                Common.log('# exception err in retry crawler: %s' % e)
                message = str(e)
                if 'Read timed out' in message:
                    if retry >= max_retry:
                        break
                    retry += 1
                    time.sleep(random.uniform(1, 3))
                elif self._is_dns_failure(message):
                    # The second check used to be a bare str.find() result,
                    # which is truthy (-1) when the text is absent, so
                    # every unknown error was treated as a DNS failure.
                    if retry >= max_retry:
                        break
                    retry += 1
                    # redial to get a fresh connection
                    self._redial_and_wait()
                else:
                    break

        return page
Beispiel #4
0
class Act():
    '''A class of JM activity'''

    # href patterns for item links embedded directly in the activity page,
    # tried in this order by actItems().
    _ITEM_HREF_PATTERNS = (
        r'<.+? href="(http://www.jumei.com/i/deal/.+?)".+?>',
        r'<.+? href="(http://www.jumeiglobal.com/deal/.+?)".+?>',
        r'<.+? href="(http://item.jumei.com/.+?)".+?>',
        r'<.+? href="(http://item.jumeiglobal.com/.+?)".+?>',
    )

    # URL template of the paginated shelf-model ajax interface.
    _SHELF_URL = 'http://hd.jumei.com/ajax/get_shelfmodel%s/%s_%s_%s_%s_%s_%s_ShellCallbackDataShow%s.json?callback=ShellCallbackDataShow%s'

    # writeLog() is a no-op unless this is switched on; the original code
    # returned before writing anything, so the default keeps that behaviour.
    _WRITE_PAGES_ENABLED = False

    def __init__(self):
        '''Initialise every field of the activity record to its empty default.'''
        # brand-group crawl settings
        self.crawler = JMCrawler()
        self.crawling_time = Common.now() # current crawl time
        self.crawling_time_s = Common.time_s(self.crawling_time)
        self.crawling_begintime = '' # start time of this crawl run
        self.crawling_beginDate = '' # date of this crawl run
        self.crawling_beginHour = '' # hour of this crawl run
        self.crawling_confirm = 1 # whether this activity needs crawling 1: not started yet, needs crawling 2: already started 0: only item positions need updating

        # category
        self.act_platform = 1 # platform the activity is on 1: Jumei PC
        self.channel_id = 0 # channel the activity is in
        self.channel_name = '' # channel the activity is in
        self.channel_url = '' # url of the channel the activity is in
        self.category_id = 0 # id of the category the activity is in
        self.category_name = '' # name of the category the activity is in
        self.act_position = 0 # position of the activity within its category
        self.front_category_id = 0 # id of the front-end category the activity is in
        self.category_type = '0' # type of the activity's category, '0': default ('1' and '2' are not described in the original comment)
        self.sub_nav_name = '' # name of the sub-navigation under the activity's category

        # whether it is shown on the home page
        self.home_acts = {} # home-page activity info
        self.act_home = 0 # whether shown on the home page, 0: no, 1: yes
        self.act_home_position = '' # position on the home page
        self.act_home_dataType = '' # home-page section it belongs to

        # activity info
        self.act_id = '' # activity id
        self.act_url = '' # activity link
        self.act_url_tail = '' # activity link suffix
        self.act_name = '' # activity name
        self.act_desc = '' # activity description
        self.act_logopic_url = '' # activity logo image link
        self.act_enterpic_url = '' # activity display image link
        self.act_times = '' # activity time range
        self.act_discounts = '' # activity discount info
        self.act_start_time = 0.0 # activity start time
        self.act_start_time_s = '' # activity start time as a string
        self.act_start_date = '' # activity start date
        self.act_time_diff = '' # sale timediff, will start_time
        self.server_time = '' # crawler jm server time
        self.act_end_time = 0.0 # activity end time
        self.act_end_time_s = '' # activity end time as a string
        self.act_status = '' # activity status
        self.act_sign = 1 # activity flag 1: regular brand group, 2: combined group, 3: (not described in the original comment)
        self.act_other_ids = '' # for a combined group, the ids of the other groups
        self.act_brand = '' # activity brand info
        self.act_brand_id = '' # activity brand id

        # shop info
        self.act_seller_id = '' # activity seller id
        self.act_seller_name = '' # activity seller name (back-filled)
        self.act_shop_id = '' # activity shop id (back-filled)
        self.act_shop_name = '' # activity shop name (back-filled)

        # activity transaction info
        self.act_soldcount = 0 # number sold in the activity
        self.act_remindnum = 0 # number of people following the activity
        self.act_discount = '' # activity discount
        self.act_coupon = 0 # activity coupon, default 0 means none
        self.act_coupons = [] # list of coupon contents

        # activity items
        self.act_itemids = []
        self.act_itemval_d = {}
        self.act_itemval_list = []

        # raw data
        self.act_pagedata = '' # full content of the data entry the activity came from
        self.act_page = '' # html content of the activity page
        self.act_pages = {} # data requested from within the activity page

    # activity info
    def itemConfig(self):
        '''Read brand id, start time and URL suffix out of the activity page.'''
        page = self.act_page
        m = re.search(r'JM\.brand_id\s*=\s*(\d+);', page, flags=re.S)
        if m:
            self.act_brand_id = m.group(1)
        m = re.search(r'JM\.special_start_time\s*=\s*(\d+);', page, flags=re.S)
        if m:
            # kept as the matched digit string; consumers convert with float()
            self.act_start_time = m.group(1)
            if float(self.act_start_time) < Common.now() and self.act_start_time != self.act_time_diff:
                self.act_end_time = float(self.server_time) + float(self.act_time_diff)

        self.actUrltail(page)

    def actUrltail(self, page):
        '''Set act_url_tail from JM.specialSym in *page*, else from act_url.

        The attribute is left untouched when none of the patterns match.
        '''
        m = re.search(r'JM\.specialSym\s*=\s*"(.+?)";', page, flags=re.S)
        if not m:
            m = re.search(r'/(\w+)\.html', self.act_url)
        if not m:
            m = re.search(r'/.+?-(\w+)\.html', self.act_url)
        if m:
            self.act_url_tail = m.group(1)

    # collect item data from the activity page
    def actItems(self):
        '''Register every item found in the page, then those behind ajax shelves.'''
        page = self.act_page
        if self.act_url_tail == '':
            self.actUrltail(page)

        i_p = 0
        for pattern in self._ITEM_HREF_PATTERNS:
            i_p = self.findItemsByUrl(re.compile(pattern, flags=re.S), page, i_p)

        # ajax
        self.actAjax(page, i_p)

    def findItemsByUrl(self, p, page, i_p):
        '''Register each URL matched by compiled pattern *p* in *page*.

        i_p is the running item position; the updated value is returned.
        '''
        for url_match in p.finditer(page):
            i_p += 1
            self.itemVal((url_match.group(1), i_p))
        return i_p

    # activity ajax
    def actAjax(self, page, i_p):
        '''Load items for every act_container of model 3 found in *page*.

        Returns the updated running item position.
        '''
        p = re.compile(r'<div class="act_container" data-model-id\s*=\s*"(.+?)" data-floor-id="(.+?)" data-preview-id="(.+?)".+?>', flags=re.S)
        for a_container in p.finditer(page):
            a_model, f_id, p_id = a_container.groups()
            try:
                model = int(a_model)
            except ValueError:
                # a non-numeric model id cannot be the supported model 3
                continue
            if model == 3:
                i_p = self.actAjax1(a_model, f_id, p_id, i_p)

        return i_p

    def _shelf_url(self, a_model, f_id, p_id, p_size, p_index):
        '''Build the shelf-model ajax URL for one page of results.'''
        return self._SHELF_URL % (str(a_model), str(self.act_id), str(a_model), str(p_id), str(p_size), str(p_index), str(self.act_start_time), str(f_id), str(f_id))

    @staticmethod
    def _count_num(result):
        '''Return the count_num carried by a get_itemjson result, or 0.'''
        t, r_data = result
        if t == 'd':
            data = r_data.get('data') if isinstance(r_data, dict) else None
            if isinstance(data, dict) and 'count_num' in data:
                return int(data['count_num'])
        elif t == 's':
            m = re.search(r'"count_num":(\d+),', r_data, flags=re.S)
            if m:
                return int(m.group(1))
        return 0

    # ajax type 1
    def actAjax1(self, a_model, f_id, p_id, i_p):
        '''Fetch all pages of one ajax shelf and register their items.

        a_model, f_id, p_id -- model, floor and preview ids of the container.
        i_p                 -- running item position.

        Returns the updated running item position.  Exceptions raised by
        get_itemjson propagate to the caller.
        '''
        p_size = 30
        callback = 'ShellCallbackDataShow%s' % (str(f_id))

        result = self.get_itemjson(self._shelf_url(a_model, f_id, p_id, p_size, 0), self.act_url, callback)
        i_p = self.parse_item(result, i_p, p_id, 0)

        # Remaining pages: ceiling division, so a count that is an exact
        # multiple of the page size does not trigger an extra request.
        total_pages = max(1, (self._count_num(result) + p_size - 1) // p_size)
        for page_i in range(2, total_pages + 1):
            p_index = (page_i - 1) * p_size
            result = self.get_itemjson(self._shelf_url(a_model, f_id, p_id, p_size, p_index), self.act_url, callback)
            i_p = self.parse_item(result, i_p, p_id, p_index)

        return i_p

    def get_itemjson(self, a_url, refers, a_back):
        '''Fetch a JSONP response and unwrap the payload of callback *a_back*.

        Returns ('d', parsed_object) when the payload is valid JSON, else
        ('s', raw_payload_string).

        Raises Common.InvalidPageException when the response is empty or
        does not contain the callback wrapper.
        '''
        r_page = self.crawler.getData(a_url, refers)
        if not r_page:
            raise Common.InvalidPageException("# get_itemjson: get item json data empty, url:%s."%(a_url))
        # escape the callback name so it is matched literally
        m = re.search(r'%s\((.+?)\);$' % re.escape(a_back), r_page, flags=re.S)
        if not m:
            raise Common.InvalidPageException("# get_itemjson: not get item json data, url:%s, result:%s."%(a_url, str(r_page)))
        result = m.group(1)
        try:
            return ('d', json.loads(result))
        except ValueError as e:
            Common.log('# exception err in get_jsonData load json: %s' % e)
            Common.log('# return string: %s' % result)
            return ('s', result)

    def _from_tag(self, p_index, p_id):
        '''Return the "from" tracking tag for an item, or None without a URL suffix.'''
        if self.act_url_tail == '':
            return None
        return '%s_pos_%s_%s1' % (self.act_url_tail, str(p_index), str(p_id))

    def parse_item(self, result, i_p, p_id, p_index):
        '''Register every product contained in a get_itemjson *result*.

        i_p is the running item position and p_index the offset of the page
        within the shelf; the updated i_p is returned.
        '''
        t, r_data = result
        if t == 'd':
            data = r_data.get('data') if isinstance(r_data, dict) else None
            products = data.get('products') if isinstance(data, dict) else None
            for product in products or []:
                i_p += 1
                p_index += 1
                self.parse_dictitem(product, i_p, self._from_tag(p_index, p_id))
        elif re.search(r'"data":{"products":(\[{.+?}\]),.+?}}', r_data, flags=re.S):
            # payload was not valid JSON: pull product objects out by regex
            p = re.compile(r'({"hash_id":.+?"pic_url":".+?"})', flags=re.S)
            for data in p.finditer(r_data):
                i_p += 1
                p_index += 1
                self.parse_stritem(data.group(1), i_p, self._from_tag(p_index, p_id))
        return i_p

    def _sale_status(self, sellable):
        '''Return 'end' when sellable is 0, else 'zs' if the start time has passed, else 'yr'.'''
        if float(self.act_start_time) < Common.now():
            u_status = 'zs'
        else:
            u_status = 'yr'
        if sellable is not None and int(sellable) == 0:
            u_status = 'end'
        return u_status

    @staticmethod
    def _item_url(is_global, i_id, from_s, u_status):
        '''Build the item URL on the global or regular item host.'''
        p_url = 'http://item.jumeiglobal.com' if is_global else 'http://item.jumei.com'
        if from_s:
            return '%s/%s.html?from=%s&status=%s' % (p_url, i_id, from_s, u_status)
        return '%s/%s.html' % (p_url, i_id)

    def parse_dictitem(self, product, i_p, from_s=None):
        '''Register one product given as a parsed JSON dict.

        NOTE(review): the name preference here is medium_name then
        short_name, the reverse of parse_stritem -- confirm which is intended.
        '''
        i_id = product.get('hash_id', '')

        i_name = ''
        if 'medium_name' in product:
            i_name = product['medium_name']
        elif 'short_name' in product:
            i_name = product['short_name']

        u_status = self._sale_status(product.get('sellable'))
        is_global = 'global' in (product.get('category') or '')
        i_url = self._item_url(is_global, i_id, from_s, u_status)

        self.itemVal((product, i_id, i_name, i_url, i_p))

    def parse_stritem(self, product, i_p, from_s=None):
        '''Register one product given as a raw JSON-like string.

        NOTE(review): the name preference here is short_name then
        medium_name, the reverse of parse_dictitem -- confirm which is intended.
        '''
        i_id = ''
        m = re.search(r'"hash_id":"(.+?)",', product, flags=re.S)
        if m:
            i_id = m.group(1)

        i_name = ''
        m = re.search(r'"short_name":"(.+?)",', product, flags=re.S)
        if not m:
            m = re.search(r'"medium_name":"(.+?)",', product, flags=re.S)
        if m:
            i_name = m.group(1)

        m = re.search(r'"sellable":\s*(\d+)', product, flags=re.S)
        u_status = self._sale_status(m.group(1) if m else None)

        m = re.search(r'"category":"(.+?)",', product, flags=re.S)
        is_global = bool(m) and 'global' in m.group(1)
        i_url = self._item_url(is_global, i_id, from_s, u_status)

        self.itemVal((product, i_id, i_name, i_url, i_p))

    # store item info
    def itemVal(self, val):
        '''Record one item in act_itemval_d / act_itemids.

        val is either (item_url, position) for links found in the page, or
        (data, item_id, item_name, item_url, position) for ajax products.
        Empty input and entries without both a URL and an id are ignored.
        An item seen before keeps its first entry unless that entry has no
        data and the new one does.
        '''
        if not val:
            return

        data, i_id, i_name = '', '', ''
        if len(val) == 2:
            i_url, i_position = val
            if i_url:
                m = re.search(r'/(\w+)\.html', i_url)
                if m:
                    i_id = m.group(1)
                    host = 'item.jumeiglobal.com' if 'jumeiglobal' in i_url else 'item.jumei.com'
                    pieces = i_url.split('?')
                    i_url = 'http://%s/%s.html' % (host, i_id)
                    # keep the query string only when the link has one
                    if len(pieces) > 1:
                        i_url = '%s?%s' % (i_url, pieces[1])
        else:
            data, i_id, i_name, i_url, i_position = val

        if i_url in ('', None) or i_id in ('', None):
            return

        i_val = (self.act_id, self.act_name, self.act_url, data, i_id, i_name, i_url, i_position)
        existing = self.act_itemval_d.get(i_id)
        if existing is None:
            self.act_itemval_d[i_id] = i_val
            self.act_itemids.append(i_id)
        elif existing[3] == '' and data != '':
            self.act_itemval_d[i_id] = i_val

    # activity page
    def actPage(self):
        '''Download the activity page into act_page and act_pages['act-home'].

        Does nothing when act_url is empty.  Raises
        Common.InvalidPageException when the crawler returns no content.
        '''
        if not self.act_url:
            return
        data = self.crawler.getData(self.act_url, self.channel_url)
        if not data:
            raise Common.InvalidPageException("# actPage:not find act page,act_id:%s,act_name:%s,act_url:%s"%(str(self.act_id), self.act_name, self.act_url))
        self.act_page = data
        self.act_pages['act-home'] = (self.act_url, data)

    # activity info plus basic info of the items in it
    def antPage(self, val):
        '''Load an activity from the 15-field tuple *val* and crawl its page and items.'''
        self.channel_id, self.channel_name, self.channel_url, self.act_position, self.act_id, self.act_url, self.act_name, self.act_desc, self.act_logopic_url, self.act_enterpic_url, self.act_times, self.act_discounts, self.act_time_diff, self.server_time, self.crawling_begintime = val
        begin = time.localtime(self.crawling_begintime)
        # date this crawl run started
        self.crawling_beginDate = time.strftime("%Y-%m-%d", begin)
        # hour this crawl run started
        self.crawling_beginHour = time.strftime("%H", begin)
        self.actPage()
        self.itemConfig()
        self.actItems()

    # output the pages of the activity
    def outItemPage(self,crawl_type):
        '''Return (key, pages): a storage key and each stored page prefixed with its URL.'''
        stamp = self.crawling_begintime if self.crawling_begintime != '' else self.crawling_time
        time_s = time.strftime("%Y%m%d%H", time.localtime(stamp))
        # timeStr_jmtype_webtype_act_crawltype_actid
        key = '%s_%s_%s_%s_%s_%s' % (time_s,Config.JM_TYPE,'1','act',crawl_type,str(self.act_id))
        pages = {}
        for p_tag, (p_url, p_content) in self.act_pages.items():
            f_content = '<!-- url=%s --> %s' %(p_url, p_content)
            pages[p_tag] = f_content.strip()
        return (key,pages)

    # write html files
    def writeLog(self, time_path):
        '''Write the crawled pages under Config.pagePath + time_path.

        Disabled unless _WRITE_PAGES_ENABLED is true.  Always returns None;
        any exception while writing is logged and swallowed.
        '''
        if not self._WRITE_PAGES_ENABLED:
            return None
        try:
            for f_name, _p_tag, f_path, f_content in self.outItemLog():
                filepath = Config.pagePath + time_path + f_path
                Config.createPath(filepath)
                with open(filepath + f_name, 'w') as fout:
                    fout.write(f_content)
        except Exception as e:
            Common.log('# exception err in writeLog info: %s' % e)
        return None

    # output the log of crawled pages
    def outItemLog(self):
        '''Return a list of (file_name, page_tag, dir_name, file_content), one per stored page.'''
        pages = []
        for p_tag, (p_url, p_content) in self.act_pages.items():
            # page file name
            f_path = '%s_act/' %(self.act_id)
            f_name = '%s-%s_%d.htm' %(self.act_id, p_tag, self.crawling_time)

            # page file content
            f_content = '<!-- url=%s -->\n%s\n' %(p_url, p_content)
            pages.append((f_name, p_tag, f_path, f_content))

        return pages

    @staticmethod
    def _time_s_or_blank(value):
        '''Format a timestamp with Common.time_s, or return '' when it is unset or below 1.'''
        if not value:
            return ''
        stamp = float(value)
        if int(stamp) <= 0:
            return ''
        return Common.time_s(stamp)

    def outSql(self):
        '''Return the activity as a 16-field tuple; unset start/end times become ''.'''
        act_start_time = self._time_s_or_blank(self.act_start_time)
        act_end_time = self._time_s_or_blank(self.act_end_time)
        return (Common.time_s(self.crawling_time),self.channel_id,self.channel_name,self.act_id,self.act_name,self.act_desc,self.act_platform,self.act_position,self.act_url,self.act_logopic_url,self.act_enterpic_url,self.act_brand_id,act_start_time,act_end_time,self.crawling_beginDate,self.crawling_beginHour)