Example #1
0
    def __init__(self, jhs_type, thread_num=10, a_val=None):
        '''Initialise the threaded worker state.

        jhs_type   -- queue type (h: every hour, i: item info detail)
        thread_num -- passed straight to MyThread.__init__
        a_val      -- appendix value kept on the instance
        '''
        # parent constructor
        MyThread.__init__(self, thread_num)

        # thread lock
        self.mutex = threading.Lock()

        # storage handles: mysql and mongodb fs
        self.mysqlAccess = MysqlAccess()
        self.mongofsAccess = MongofsAccess()

        # queue type and appendix value supplied by the caller
        self.jhs_type, self.a_val = jhs_type, a_val

        # activity items
        self.items = []

        # dial client, local ip and router tag used when dialing
        self.dial_client = DialClient()
        self._ip = Common.local_ip()
        self._tag = 'ikuai'  # the other tag seen in this code base is 'tpent'

        # items given up after too many retries
        self.giveup_items = []
Example #2
0
    def __init__(self, jhs_type, thread_num=15, a_val=None):
        '''Initialise the threaded worker state.

        jhs_type   -- queue type (1: upcoming brand-group channel page,
                      2: daily check of activities that have not ended yet,
                      3: newly added activities)
        thread_num -- passed straight to MyThread.__init__
        a_val      -- appendix value kept on the instance
        '''
        # parent constructor
        MyThread.__init__(self, thread_num)

        # thread lock
        self.mutex = threading.Lock()

        # storage handles: mysql and mongodb fs
        self.mysqlAccess = MysqlAccess()
        self.mongofsAccess = MongofsAccess()

        # appendix value and queue type supplied by the caller
        self.a_val, self.jhs_type = a_val, jhs_type

        # activity items
        self.items = []

        # dial client, local ip and router tag used when dialing
        self.dial_client = DialClient()
        self._ip = Common.local_ip()
        self._tag = 'ikuai'
Example #3
0
    def __init__(self):
        '''Create the crawler, the dial settings and the wait time.'''
        # crawl settings
        self.crawler = TBCrawler()

        # dial client, local ip and router tag used when dialing
        self.dial_client = DialClient()
        self._ip = Common.local_ip()
        self._tag = 'ikuai'

        # wait time
        self.w_time = 1
Example #4
0
    def __init__(self):
        '''Create the crawler, the value queue and the dial settings.'''
        # crawl settings
        self.crawler = TBCrawler()
        # queue of values waiting to be crawled
        self.val_queue = Queue.Queue()

        # dial client, local ip and router tag used when dialing
        self.dial_client = DialClient()
        self._ip = Common.local_ip()
        self._tag = 'ikuai'
Example #5
0
    def init_crawl(self, _obj, _crawl_type):
        '''Reset the per-crawl state for the given object and crawl type.'''
        self._obj, self._crawl_type = _obj, _crawl_type

        # dial client, local ip and router tag used when dialing
        self.dial_client = DialClient()
        self._ip = Common.local_ip()
        self._router_tag = 'ikuai'  # the other tag seen in this code base is 'tpent'

        # collected items
        self.items = []

        # items given up on
        self.giveup_items = []

        # message value used for a given-up message
        self.giveup_val = None
Example #6
0
    def __init__(self, _q_type, thread_num=10, a_val=None):
        '''Initialise the threaded JHS brand worker state.

        _q_type    -- queue type (main: new items, day: items crawled once a
                      day, hour: items crawled once an hour, update: update)
        thread_num -- passed straight to MyThread.__init__
        a_val      -- appendix value kept on the instance
        '''
        # parent constructor
        MyThread.__init__(self, thread_num)

        # thread lock
        self.mutex = threading.Lock()

        self.worker_type = Config.JHS_Brand

        # message
        self.message = Message()

        # storage handles: mysql, redis db and mongodb fs
        self.mysqlAccess = MysqlAccess()
        self.redisAccess = RedisAccess()
        self.mongofsAccess = MongofsAccess()

        # queue type and appendix value supplied by the caller
        self._q_type, self.a_val = _q_type, a_val

        # activity items
        self.items = []

        # dial client, local ip and router tag used when dialing
        self.dial_client = DialClient()
        self._ip = Common.local_ip()
        self._tag = 'ikuai'  # the other tag seen in this code base is 'tpent'

        # items given up after too many retries
        self.giveup_items = []
Example #7
0
    def __init__(self, _q_type, thread_num=10, a_val=None):
        '''Initialise the threaded TC spot worker state.

        _q_type    -- tc queue type (new: new items)
        thread_num -- passed straight to MyThread.__init__
        a_val      -- appendix value kept on the instance
        '''
        # parent constructor
        MyThread.__init__(self, thread_num)

        # thread lock
        self.mutex = threading.Lock()

        self.worker_type = Config.TC_Spot

        # message
        self.message = Message()

        # storage handles: mysql and mongodb fs
        self.mysqlAccess = MysqlAccess()
        self.mongofsAccess = MongofsAccess()

        # queue type and appendix value supplied by the caller
        self._q_type, self.a_val = _q_type, a_val

        # activity items
        self.items = []

        # dial client, local ip and router tag used when dialing
        self.dial_client = DialClient()
        self._ip = Common.local_ip()
        self._tag = 'ikuai'  # the other tag seen in this code base is 'tpent'

        # items given up after too many retries
        self.giveup_items = []
Example #8
0
    def __init__(self, key, q_type, thread_num=10, a_val=None):
        '''Initialise the threaded TC queue worker state.

        key        -- redis queue key
        q_type     -- tc queue type (new...)
        thread_num -- passed straight to MyThread.__init__
        a_val      -- appendix value kept on the instance
        '''
        # parent constructor
        MyThread.__init__(self, thread_num)

        # thread lock
        self.mutex = threading.Lock()

        # tc type
        self.tc_type = Config.TC_TYPE

        # storage handles: mysql, redis queue and mongodb fs
        self.mysqlAccess = MysqlAccess()
        self.redisQueue = RedisQueue()
        self.mongofsAccess = MongofsAccess()

        # tc queue type and the redis key of the queue
        self.tc_queue_type, self._key = q_type, key

        # appendix value
        self.a_val = a_val

        # returned items
        self.items = []

        # dial client, local ip and router tag used when dialing
        self.dial_client = DialClient()
        self._ip = Common.local_ip()
        self._tag = 'ikuai'  # the other tag seen in this code base is 'tpent'

        # items given up after too many retries
        self.giveup_items = []
Example #9
0
    def __init__(self, itemtype, q_type, thread_num=10, a_val=None):
        '''Initialise the threaded JHS queue worker state.

        itemtype   -- item type, second component of the redis key
        q_type     -- jhs queue type (h: every hour), third component of the key
        thread_num -- passed straight to MyThread.__init__
        a_val      -- appendix value kept on the instance
        '''
        # parent constructor
        MyThread.__init__(self, thread_num)

        # thread lock
        self.mutex = threading.Lock()

        # jhs type and item type
        self.jhs_type = Config.JHS_TYPE
        self.item_type = itemtype

        # storage handles: mysql, redis queue and mongodb fs
        self.mysqlAccess = MysqlAccess()
        self.redisQueue = RedisQueue()
        self.mongofsAccess = MongofsAccess()

        # queue type, and the key built as <jhs type>_<item type>_<queue type>
        self.jhs_queue_type = q_type
        key_parts = (self.jhs_type, self.item_type, self.jhs_queue_type)
        self._key = '%s_%s_%s' % key_parts

        # appendix value
        self.a_val = a_val

        # activity items
        self.items = []

        # dial client, local ip and router tag used when dialing
        self.dial_client = DialClient()
        self._ip = Common.local_ip()
        self._tag = 'ikuai'  # the other tag seen in this code base is 'tpent'

        # items given up after too many retries
        self.giveup_items = []
Example #10
0
class Jsonpage():
    '''Fetch JSON pages through the crawler and flatten them into value tuples.

    Work values are ``(a_url, refers, a_val)`` tuples. ``get_json`` queues them
    with a retry counter and drains the queue; ``get_jsonPage`` fetches one url
    plus its follow-up pages; the ``parser_*`` methods turn the fetched pages
    into one tuple per activity / item, each carrying its 1-based position.

    Every print uses the single-argument ``print(...)`` form, which writes the
    same text as the print statement and also parses under Python 3.
    '''
    def __init__(self):
        # crawl settings
        self.crawler = TBCrawler()
        # pending work, stored as (retry count, value)
        self.val_queue = Queue.Queue()

        # dial client
        self.dial_client = DialClient()

        # local ip
        self._ip = Common.local_ip()

        # router tag
        self._tag = 'ikuai'

    def putVal(self, _val):
        '''Queue one value with a retry count of 0.'''
        self.val_queue.put((0,_val),block=False)

    def putVals(self, _vals):
        '''Queue every value of _vals with a retry count of 0.'''
        for _val in _vals:
            self.val_queue.put((0, _val),block=False)

    # To crawl retry
    def crawlRetry(self, _data):
        '''Re-queue a failed (retry, value) pair until Config.json_crawl_retry is reached.'''
        if not _data:
            return
        _retry, _val = _data
        _retry += 1
        if _retry < Config.json_crawl_retry:
            self.val_queue.put((_retry, _val),block=False)
        else:
            print("# retry too many times, no get json: %s" % (_val,))

    # To dial router
    def dialRouter(self, _type, _obj):
        '''Send a dial request for module "<_type>_<_obj>"; failures are only printed.'''
        try:
            _module = '%s_%s' %(_type, _obj)
            self.dial_client.send((_module, self._ip, self._tag))
        except Exception as e:
            print('# To dial router exception : %s' % (e,))

    def get_json(self, json_valList):
        '''Fetch every (a_url, refers, a_val) in json_valList and return all pages.

        Values are queued, then processed until the queue is empty. A failed
        value is acknowledged with task_done() and, depending on the exception,
        re-queued through crawlRetry. Returns the concatenated get_jsonPage
        results (an empty list when json_valList is empty).
        '''
        bResult_list = []
        if not json_valList:
            return bResult_list

        self.putVals(json_valList)
        while True:
            _data = None
            try:
                try:
                    # take the next queued message
                    _data = self.val_queue.get(block=False)
                except Queue.Empty:
                    # Queue.Empty is reached through the Queue module this class
                    # already uses, so no separate import of Empty is needed
                    break
                a_url, refers, a_val = _data[1]
                bResult_list += self.get_jsonPage(a_url,refers,a_val)
                # tell the queue this task is finished
                self.val_queue.task_done()
            except Common.InvalidPageException as e:
                print('# Invalid page exception: %s' % (e,))
                self.val_queue.task_done()
                self.crawlRetry(_data)
            except Common.DenypageException as e:
                print('# Deny page exception: %s' % (e,))
                self.val_queue.task_done()
                self.crawlRetry(_data)
                time.sleep(60)
                # re-dial the router
                try:
                    self.dialRouter(4, 'chn')
                except Exception as e:
                    print('# DailClient Exception err: %s' % (e,))
                    time.sleep(random.uniform(10,30))
                time.sleep(random.uniform(10,30))
            except Common.SystemBusyException as e:
                print('# System busy exception: %s' % (e,))
                self.val_queue.task_done()
                self.crawlRetry(_data)
                time.sleep(random.uniform(10,30))
            except Exception as e:
                print('# exception err: %s' % (e,))
                self.val_queue.task_done()
                # only these two error texts are retried
                err_text = str(e)
                if 'Read timed out' in err_text or 'Name or service not known' in err_text:
                    self.crawlRetry(_data)
                time.sleep(random.uniform(10,30))
        return bResult_list

    def _ks_token(self):
        '''Return a fresh "<milliseconds>_<0..9999>" value for the _ksTS parameter.'''
        return str(int(time.time()*1000)) + '_' + str(random.randint(0,9999))

    # fetch every page of data through the data interface
    def get_jsonPage(self, url, refers='', a_val=()):
        '''Fetch url and all its follow-up pages.

        Returns a list of ``(result,) + a_val`` tuples, one per fetched page.
        The first page is always included; later pages only when their result
        is truthy. The page count comes from "totalPage" in the first result.
        '''
        bResult_list = []
        p_url = url + '&_ksTS=%s' % self._ks_token()
        result = self.get_jsonData(p_url, refers)
        bResult_list.append((result,)+a_val)

        # number of pages announced by the first result
        totalPage = 1
        if isinstance(result, dict) and 'totalPage' in result:
            totalPage = int(result['totalPage'])
        elif isinstance(result, str):
            m = re.search(r'"totalPage":(\d+),', result, flags=re.S)
            if m:
                totalPage = int(m.group(1))

        # The page parameter is matched whether it is followed by '&' or ends
        # the url, so a parameter appended for page 2 is rewritten for page 3
        # instead of a second one being appended.
        page_param = re.compile(r'page=\d+(?=&|$)')
        for page_i in range(2, totalPage+1):
            if page_param.search(p_url):
                p_url = page_param.sub('page=%d' % page_i, p_url)
            else:
                p_url = p_url + '&page=%d' % page_i
            p_url = re.sub(r'&_ksTS=\d+_\d+', '&_ksTS=%s' % self._ks_token(), p_url)
            result = self.get_jsonData(p_url, refers)
            if result:
                bResult_list.append((result,)+a_val)

        return bResult_list

    # fetch the data of one page
    def get_jsonData(self, url, refers=''):
        '''Download url and decode it as JSON.

        Returns the decoded object, or the page text itself when decoding
        fails. Raises Common.InvalidPageException when the crawler returns
        nothing.
        '''
        b_page = self.crawler.getData(url, refers)
        if not b_page:
            raise Common.InvalidPageException("# Jsonpage get_jsonData: not get jsondata url:%s."%(url))
        try:
            # drop a leading ']' before decoding
            b_page = re.sub('^]', '', b_page)
            return json.loads(b_page)
        except Exception as e:
            print('# exception err in get_jsonData load json: %s' % (e,))
            print('# return string: %s' % (b_page,))
            return b_page

    def _page_entries(self, page_info, list_key, entry_re):
        '''Return (entries, currentPage) for one fetched page, or None to skip it.

        A dict page yields page_info[list_key] when that key is present and not
        an empty list. A str page is scanned with regular expressions: the
        list_key array is located and entry_re extracts each entry from it.
        currentPage defaults to 1 when the page does not state it.
        '''
        if isinstance(page_info, dict) and list_key in page_info and page_info[list_key] != []:
            currentPage = 1
            if 'currentPage' in page_info:
                currentPage = int(page_info['currentPage'])
            return page_info[list_key], currentPage

        if isinstance(page_info, str):
            m = re.search(r'"%s":\[(.+?}})\]' % list_key, page_info, flags=re.S)
            if not m:
                return None
            entries = [found.group(1) for found in entry_re.finditer(m.group(1))]
            currentPage = 1
            m = re.search(r'"currentPage":(\d+),', page_info, flags=re.S)
            if m:
                currentPage = int(m.group(1))
            return entries, currentPage

        return None

    # parse the data of every page
    def parser_brandjson(self, bResult_list, a_val=None):
        '''Turn fetched brand pages into one tuple per activity.

        Each page is ``(page_info, x, y, ...)``; each output tuple is
        ``(activity, page[1], page[2], position)`` followed by a_val when a_val
        is truthy. position is 1-based; on pages after the first it is offset
        by ``(currentPage - 1)`` times the entry count of the last page whose
        currentPage was 1.
        '''
        print('# brand activities parse json start: %s' % (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())),))
        # values needed by the worker threads
        act_valList = []
        # entry count of the first page, used to compute positions
        prepage_count = 0
        brand_re = re.compile(r'({"baseInfo":.+?}})')
        for page in bResult_list:
            parsed = self._page_entries(page[0], 'brandList', brand_re)
            if parsed is None:
                continue
            activities, currentPage = parsed
            print('# brand every page num: %s' % (len(activities),))

            b_position_start = 0
            if currentPage > 1:
                b_position_start = (currentPage - 1) * prepage_count
            else:
                # remember how many entries the first page holds
                prepage_count = len(activities)

            for i, activity in enumerate(activities):
                val = (activity, page[1], page[2], (b_position_start+i+1))
                if a_val:
                    val = val + a_val
                act_valList.append(val)
        print('# brand activities parse json end: %s' % (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())),))
        return act_valList


    # parse the item data of every page
    def parser_itemjson(self, iResult_list):
        '''Turn fetched item pages into one tuple per item.

        Each page is ``(page_info,) + a_val``; each output tuple is
        ``(item,) + a_val + (position,)``. position is computed exactly as in
        parser_brandjson.
        '''
        print('# items parse json start: %s' % (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())),))
        # values needed by the worker threads
        item_valList = []
        # entry count of the first page, used to compute positions
        prepage_count = 0
        item_re = re.compile(r'({"baseinfo":.+?}})',re.I)
        for page in iResult_list:
            a_val = page[1:]
            parsed = self._page_entries(page[0], 'itemList', item_re)
            if parsed is None:
                continue
            items, currentPage = parsed
            print('# item every page num: %s' % (len(items),))

            i_position_start = 0
            if currentPage > 1:
                i_position_start = (currentPage - 1) * prepage_count
            else:
                # remember how many entries the first page holds
                prepage_count = len(items)

            for i, item in enumerate(items):
                if a_val:
                    item_val = (item,) + (a_val + ((i_position_start+i+1),))
                else:
                    item_val = (item, (i_position_start+i+1))
                item_valList.append(item_val)
        print('# items parse json end: %s' % (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())),))
        return item_valList
Example #11
0
class JHSWorker():
    '''A class of jhs worker'''
    def __init__(self):
        '''Create the storage handles, crawl helpers and crawl timestamps.'''
        # worker type: jhs brand
        self.worker_type = Config.JHS_Brand

        # queue type
        self.jhs_type = Config.JHS_TYPE

        # storage handles: mysql, redis queue, redis db and mongodb fs
        self.mysqlAccess = MysqlAccess()
        self.redisQueue = RedisQueue()
        self.redisAccess = RedisAccess()
        self.mongofsAccess = MongofsAccess()

        # json data fetcher
        self.jsonpage = Jsonpage()

        # crawl settings
        self.crawler = TBCrawler()

        # page template parser
        self.brand_temp = JHSBrandTEMP()

        # message
        self.message = Message()

        # crawl time settings; crawling_time is the current crawl time
        self.crawling_time = Common.now()
        self.begin_time = Common.now()
        self.begin_date = Common.today_s()
        self.begin_hour = Common.nowhour_s()

    def init_crawl(self, _obj, _crawl_type):
        '''Reset the per-crawl state for the given object and crawl type.'''
        self._obj, self._crawl_type = _obj, _crawl_type

        # dial client, local ip and router tag used when dialing
        self.dial_client = DialClient()
        self._ip = Common.local_ip()
        self._router_tag = 'ikuai'  # the other tag seen in this code base is 'tpent'

        # collected items
        self.items = []

        # items given up on
        self.giveup_items = []

        # message value used for a given-up message
        self.giveup_val = None

    # To dial router
    def dialRouter(self, _type, _obj):
        try:
            _module = '%s_%s' %(_type, _obj)
            self.dial_client.send((_module, self._ip, self._router_tag))
        except Exception as e:
            print '# To dial router exception :', e

    # To crawl retry
    def crawlRetry(self, _key, msg):
        if not msg: return
        msg['retry'] += 1
        _retry = msg['retry']
        _obj = msg["obj"]
        max_time = Config.crawl_retry
        if _obj == 'cat':
            max_time = Config.json_crawl_retry
        elif _obj == 'act':
            max_time = Config.act_crawl_retry
        elif _obj == 'item':
            max_time = Config.item_crawl_retry
        if _retry < max_time:
            self.redisQueue.put_q(_key, msg)
        else:
            #self.push_back(self.giveup_items, msg)
            print "# retry too many time, no get:", msg

     # To crawl page
    def crawlPage(self, _obj, _crawl_type, _key, msg, _val):
        '''Run the handler matching _obj and translate failures into retries.

        _obj 'cat' runs run_cat_home (crawl types 'home' / 'homeposition') or
        run_cat, 'act' runs run_act, 'item' runs run_item; any other value is
        only printed. Every exception is caught here: the message is handed to
        crawlRetry(_key, msg) and, depending on the exception type, the router
        is re-dialed and/or the thread sleeps for a random interval.
        '''
        try:
            if _obj == 'cat':
                if _crawl_type == 'home' or _crawl_type == 'homeposition':
                    self.run_cat_home(msg, _val)
                else:
                    self.run_cat(msg, _val)
            elif _obj == 'act':
                self.run_act(msg)
            elif _obj == 'item':
                self.run_item(msg, _val)
            else:
                print '# crawlPage unknown obj = %s' % _obj
        except Common.InvalidPageException as e:
            # invalid page: retry without waiting
            print '# Invalid page exception:',e
            self.crawlRetry(_key,msg)
        except Common.DenypageException as e:
            print '# Deny page exception:',e
            self.crawlRetry(_key,msg)
            # re-dial the router
            try:
                self.dialRouter(4, 'chn')
            except Exception as e:
                print '# DailClient Exception err:', e
                time.sleep(random.uniform(10,30))
            time.sleep(random.uniform(10,30))
        except Common.SystemBusyException as e:
            print '# System busy exception:',e
            self.crawlRetry(_key,msg)
            time.sleep(random.uniform(10,30))
        except Common.RetryException as e:
            print '# Retry exception:',e
            # when a give-up value is set, it replaces the message value before the retry
            if self.giveup_val:
                msg['val'] = self.giveup_val
            self.crawlRetry(_key,msg)
            time.sleep(random.uniform(20,30))
        except Exception as e:
            print '# exception err:',e
            self.crawlRetry(_key,msg)
            # re-dial the router, except when the error text contains 'Read timed out'
            if str(e).find('Read timed out') == -1:
                try:
                    self.dialRouter(4, 'chn')
                except Exception as e:
                    print '# DailClient Exception err:', e
            time.sleep(random.uniform(10,30))
            # NOTE(review): the traceback is logged only after the sleep above
            Common.traceback_log()

    def run_cat_home(self, msg, _val):
        msg_val = msg["val"]
        _url, refers = msg_val
        print '# brand home:',_url
        page = self.crawler.getData(_url, refers)
        # save to mongo
        # timeStr_jhstype_webtype_obj_crawltype
        time_s = time.strftime("%Y%m%d%H", time.localtime(self.crawling_time))
        key = '%s_%s_%s_%s_%s' % (time_s,Config.JHS_TYPE,'1',self._obj,self._crawl_type)
        p_content = '<!-- url=%s --> %s' % (_url,page)
        self.mongofsAccess.insertJHSPages((key,p_content))

        c_url_val_list = self.brand_temp.temp(page)
        for c_url_val in c_url_val_list:
            c_url, c_name, c_id = c_url_val
            self.items.append((Common.fix_url(c_url),c_id,c_name,Config.ju_brand_home,Config.JHS_Brand))

        if self._crawl_type == 'homeposition':
            top_acts = self.brand_temp.activityTopbrandTemp(page)
            print top_acts
            self.save_top_acts(top_acts)

    def save_top_acts(self, top_acts):
        '''Insert one hourly position row per entry of top_acts.

        Each entry may carry 'act_id', 'position', 'url' and 'datatype';
        missing fields fall back to -1, -1, '' and '' respectively.
        '''
        if not top_acts:
            return
        for act in top_acts.values():
            now_ts = Common.now()
            local_now = time.localtime(now_ts)
            c_date = time.strftime("%Y-%m-%d", local_now)
            c_hour = time.strftime("%H", local_now)

            act_id = act.get('act_id', -1)
            act_position = act.get('position', -1)
            act_url = act.get('url', '')
            f_name = act.get('datatype', '')

            # activity name and sub nav are stored empty, the floor id as 0
            row = (Common.time_s(now_ts),act_id,'',act_url,Config.JHS_Brand,'',act_position,0,f_name,'',c_date,c_hour)
            self.mysqlAccess.insertJhsActPosition_hour(row)

    def run_cat(self, msg, _val):
        msg_val = msg["val"]
        c_url, c_id, c_name, refers, pagetype = msg_val
        print '# category',c_name,c_id
        if pagetype == Config.JHS_Brand:
            a_val = (c_id, c_name)
            self.get_actjson(c_url, refers, a_val, _val, pagetype)
        elif pagetype == Config.JHS_GroupItem:
            self.get_cat_jsons(c_url, c_id, c_name, refers, _val, pagetype)
        else:
            print '# not get category pagetype...'

    def get_cat_jsons(self, c_url, c_id, c_name, refers, _val, pagetype):
        '''Download a category page and crawl every ajax url found in it.'''
        cat_val = (c_id, c_name)
        html = self.crawler.getData(c_url, refers)
        # each entry is (ajax url, sub nav); an empty list means nothing to do
        for ajax_url, sub_nav in self.getAjaxurlList((html,c_id,c_name)):
            self.get_actjson(ajax_url, refers, cat_val, _val, pagetype, sub_nav)

    def get_actjson(self, c_url, refers, a_val, _val, pagetype, c_subNav=''):
        if self._crawl_type == 'position':
            _val = (pagetype,c_subNav) + _val

        Result_list = self.jsonpage.get_jsonPage(c_url,refers,a_val)
        if Result_list and len(Result_list) > 0:
            # parser act result
            act_valList = self.jsonpage.parser_brandjson(Result_list,_val)
            if act_valList and len(act_valList) > 0:
                print '# get brand act num:',len(act_valList)
                self.items.extend(act_valList)
            else:
                print '# not get brandjson parse val list...'

    # get json ajax url
    def getAjaxurlList(self, page_val):
        '''Collect (ajax_url, sub_nav) pairs from a category page.

        page_val is (page, c_id, c_name); only page is read. Every element
        carrying an id and a data-ajaxurl attribute yields one pair:
        'amp;' is removed from the url, and the sub nav is taken from the
        element's <span class="l-f-tbox">, or failing that from the <td>
        whose data-target equals the element id (with its tags stripped).
        '''
        url_list = []
        page, _c_id, _c_name = page_val
        p = re.compile(r'<.+?id="(.+?)".+?data-ajaxurl="(.+?)".+?>(.+?)</div>',flags=re.S)
        for a_info in p.finditer(page):
            f_id, raw_url, info = a_info.groups()
            a_url = raw_url.replace('amp;','')

            c_subNav = ''
            m = re.search(r'<span class="l-f-tbox">(.+?)</span>', info, flags=re.S)
            if m:
                c_subNav = m.group(1).strip()
            if c_subNav == '':
                # f_id is text scraped from the page: escape it so characters
                # such as '(' or '.' are matched literally instead of being
                # read as regex syntax
                m = re.search(r'<td.+?data-target="%s".+?>(.+?)</td>' % re.escape(f_id), page, flags=re.S)
                if m:
                    c_subNav = re.sub(r'<.+?>','',m.group(1))
            url_list.append((a_url,c_subNav))
        return url_list

    # ACT queue
    def run_act(self, msg):
        # 默认数据
        msg_val = msg["val"]
        print '# act start:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        act_obj = None
        if self._crawl_type == 'main':
            act_obj = JHSAct()
            act_obj.antPageMain(msg_val)
        elif self._crawl_type == 'check':
            act_obj = JHSAct()
            act_obj.antPageCheck(msg_val)
        elif self._crawl_type == 'position':
            act_obj = JHSAct()
            act_obj.antPageParser(msg_val)
        print '# act end:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

        if self._crawl_type == 'position':
            brandact_id,brandact_name,brandact_url,brandact_sign,brandact_status,val = act_obj.outTupleForPosition()
            if int(brandact_sign) != 3:
                if act_obj.brandact_starttime and act_obj.brandact_starttime != 0.0 and 1 >= Common.subTS_hours(int(float(act_obj.brandact_starttime)/1000), self.crawling_time):
                    print '# insert activity position, id:%s name:%s'%(act_obj.brandact_id, act_obj.brandact_name)
                    self.mysqlAccess.insertJhsActPosition_hour(val)
                
                elif brandact_status != '' and brandact_status != 'blank':
                    print '# insert activity position, id:%s name:%s'%(act_obj.brandact_id, act_obj.brandact_name)
                    self.mysqlAccess.insertJhsActPosition_hour(val)
        else:
            act_keys = [self.worker_type, str(act_obj.brandact_id)]
            prev_act = self.redisAccess.read_jhsact(act_keys)
            # 是否需要抓取商品
            if act_obj and act_obj.crawling_confirm != 2:
                # 保存的活动信息
                self.putActDB(act_obj, prev_act)
                # 活动中的商品
                items_list = []
                # 只取非俪人购商品
                if int(act_obj.brandact_sign) != 3:
                    if act_obj.crawling_confirm == 0:
                        #更新马上开团活动中商品位置
                        self.update_actItems_position(act_obj)
                    # 多线程抓商品
                    items_list = self.run_actItems(act_obj, prev_act)
                else:
                    print '# ladygo activity id:%s name:%s'%(act_obj.brandact_id, act_obj.brandact_name)

                #print '# pro act start:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                # 处理活动信息
                #self.procAct(act_obj, prev_act, items_list)
                # 处理活动redis信息
                self.procActRedis(act_obj, prev_act, items_list)
                #print '# pro act end:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            else:
                self.update_startact(act_obj, prev_act)
                print '# Already start activity, id:%s name:%s'%(act_obj.brandact_id, act_obj.brandact_name) 

    # 更新开团后活动
    def update_startact(self, act, prev_act):
        '''Store a changed end time of an already started activity in redis and mysql.'''
        if not act.brandact_endtime or act.brandact_endtime == 0.0:
            return
        end_time_s = Common.time_s(float(act.brandact_endtime)/1000)
        if not prev_act or end_time_s == prev_act['end_time']:
            # no previous record, or the end time did not change
            return

        prev_act['end_time'] = end_time_s
        # redis first, then mysql
        act_id_s = str(act.brandact_id)
        self.redisAccess.write_jhsact([self.worker_type, act_id_s], prev_act)
        self.mysqlAccess.updateJhsActEndtime((end_time_s,act_id_s))

    #更新马上开团活动中商品位置
    def update_actItems_position(self, act):
        '''Send (item[7], act id, item[4]) for every item whose item[7] is not empty to mysql.'''
        act_id = act.brandact_id
        rows = [
            (str(item[7]), str(act_id), item[4])
            for item in act.brandact_itemVal_list
            if str(item[7]) != ''
        ]
        self.mysqlAccess.updateJhsItemPosition(rows)

    # 并行获取品牌团商品
    def run_actItems(self, act, prev_act):
        print '# act items start:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        # 需要抓取的item
        item_val_list = []
        # 过滤已经抓取过的商品ID列表
        item_ids = act.brandact_itemids
        if prev_act:
            prev_item_ids = prev_act["item_ids"]
            item_ids      = Common.diffSet(item_ids, prev_item_ids)

            # 如果已经抓取过的活动没有新上线商品,则退出
            if len(item_ids) == 0:
                print '# Activity no new Items'
                print '# Activity Items end:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), act.brandact_id, act.brandact_name
                return None

            for item in act.brandact_itemVal_list:
                if str(item[6]) in item_ids or str(item[7]) in item_ids:
                    item_val_list.append(item)
        else:
            item_val_list = act.brandact_itemVal_list

        # 如果活动没有商品, 则退出
        if len(item_ids) == 0:
            print '# run_brandItems: no items in activity, act_id=%s, act_name=%s' % (act.brandact_id,act.brandact_name)
            return None

        print '# Activity Items crawler start:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), act.brandact_id, act.brandact_name
        # 多线程 控制并发的线程数
        if len(item_val_list) > Config.item_max_th:
            m_itemsObj = JHSItemM('main', Config.item_max_th)
        else: 
            m_itemsObj = JHSItemM('main', len(item_val_list))
        m_itemsObj.createthread()
        m_itemsObj.putItems(item_val_list)
        m_itemsObj.run()

        item_list = m_itemsObj.items
        print '# Activity find new Items num:', len(item_val_list)
        print '# Activity crawl Items num:', len(item_list)
        giveup_items = m_itemsObj.giveup_items
        if len(giveup_items) > 0:
            print '# Activity giveup Items num:',len(giveup_items)
            raise Common.RetryException('# run_actItems: actid:%s actname:%s some items retry more than max times..'%(str(act.brandact_id),str(act.brandact_name)))
        print '# Activity Items end:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), act.brandact_id, act.brandact_name
        return item_list

    # To merge activity
    def mergeAct(self, act, prev_act):
        '''Fill the gaps of act from the previously stored record prev_act.

        The item id lists are united, the crawl time of the previous record is
        kept, and every field that is empty on act is copied from prev_act.
        Does nothing when prev_act is falsy.
        '''
        if not prev_act:
            return

        # unite the item ids of this crawl and the previous one
        act.brandact_itemids = Common.unionSet(act.brandact_itemids, prev_act["item_ids"])

        # keep the crawl time of the first crawl
        act.crawling_time = Common.str2timestamp(prev_act["crawl_time"])

        if not act.brandact_name:
            act.brandact_name = prev_act["act_name"]
        if not act.brandact_url:
            act.brandact_url = prev_act["act_url"]
        if not act.brandact_position or str(act.brandact_position) == '0':
            act.brandact_position = prev_act["act_position"]
        if not act.brandact_enterpic_url:
            act.brandact_enterpic_url = prev_act["act_enterpic_url"]
        if not act.brandact_remindNum or str(act.brandact_remindNum) == '0':
            act.brandact_remindNum = prev_act["act_remindnum"]
        if not act.brandact_coupons:
            act.brandact_coupon = prev_act["act_coupon"]
            act.brandact_coupons = prev_act["act_coupons"].split(Config.sep)
        if not act.brandact_starttime:
            act.brandact_starttime = Common.str2timestamp(prev_act["start_time"])
        if not act.brandact_endtime:
            act.brandact_endtime = Common.str2timestamp(prev_act["end_time"])
        if not act.brandact_other_ids:
            act.brandact_other_ids = prev_act["_act_ids"]

    # To put act db
    def putActDB(self, act, prev_act):
        # 预热信息
        if self._crawl_type == 'main':
            self.mysqlAccess.insertJhsActComing(act.outSql()) 

        # redis
        self.mergeAct(act, prev_act)
        
        if self._crawl_type == 'main':
            # mysql
            if prev_act:
                print '# update activity, id:%s name:%s'%(act.brandact_id, act.brandact_name)
                self.mysqlAccess.updateJhsAct(act.outSqlForUpdate())
            else:
                print '# insert activity, id:%s name:%s'%(act.brandact_id, act.brandact_name)
                self.mysqlAccess.insertJhsAct(act.outSql())

        # mongo
        # 存网页
        _pages = act.outItemPage(self._crawl_type)
        self.mongofsAccess.insertJHSPages(_pages)

    # To process activity in redis
    def procActRedis(self, act, prev_act, items_list):
        """Rebuild the activity's item-id list, merge it and write it to redis.

        For every row of ``items_list`` the values at index 1 (ju id) and
        index 10 (item id) are stringified and kept when non-empty.
        """
        collected = act.brandact_itemids = []
        for row in items_list or []:
            for pos in (1, 10):
                value = str(row[pos])
                if value != '':
                    collected.append(value)

        # redis
        self.mergeAct(act, prev_act)
        redis_keys = [self.worker_type, str(act.brandact_id)]
        self.redisAccess.write_jhsact(redis_keys, act.outTupleForRedis())

    # To process activity
    def procAct(self, act, prev_act, items_list):
        """Rebuild the activity's item-id list and hand it to ``putActDB``.

        For every row of ``items_list`` the values at index 1 (ju id) and
        index 10 (item id) are stringified and kept when non-empty.
        """
        collected = act.brandact_itemids = []
        for row in items_list or []:
            for pos in (1, 10):
                value = str(row[pos])
                if value != '':
                    collected.append(value)
        # Persist the crawled activity
        self.putActDB(act, prev_act)

    # ITEM queue
    def run_item(self, msg, _val):
        # 默认数据
        msg_val = msg["val"]
        brandact_id, brandact_name, item_val_list = msg_val
        print '# Activity Items start:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), brandact_id, brandact_name
        # 多线程 控制并发的线程数
        max_th = Config.item_max_th
        if len(item_val_list) > max_th:
            m_itemsObj = JHSItemM(self._crawl_type, max_th, _val)
        else:
            m_itemsObj = JHSItemM(self._crawl_type, len(item_val_list), _val)
        m_itemsObj.createthread()
        m_itemsObj.putItems(item_val_list)
        m_itemsObj.run()

        item_list = m_itemsObj.items
        print '# Activity Items num:', len(item_val_list)
        print '# Activity crawl Items num:', len(item_list)
        giveup_items = m_itemsObj.giveup_items
        if len(giveup_items) > 0:
            print '# Activity giveup Items num:',len(giveup_items)
            self.giveup_val = (brandact_id, brandact_name, giveup_items)
            raise Common.RetryException('# run_item: actid:%s actname:%s some items retry more than max times..'%(str(brandact_id),str(brandact_name)))
        print '# Activity Items end:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), brandact_id, brandact_name

    def process(self, _obj, _crawl_type, _val=None):
        self.init_crawl(_obj, _crawl_type)

        i, M = 0, 20
        if _obj == 'cat':
            M = 10
        n = 0
        while True: 
            if _crawl_type and _crawl_type != '':
                _key = '%s_%s_%s' % (self.jhs_type,_obj,_crawl_type)
            else:
                _key = '%s_%s' % (self.jhs_type,_obj)
            _msg = self.redisQueue.get_q(_key)

            # 队列为空
            if not _msg:
                i += 1
                if i > M:
                    print '# not get queue of key:',_key,time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                    print '# all get num of item in queue:',n
                    break
                time.sleep(10)
                continue
            n += 1
            try:
                self.crawlPage(_obj, _crawl_type, _key, _msg, _val)
            except Exception as e:
                print '# exception err in process of JHSWorker:',e,_key,_msg

    # 删除redis数据库过期活动
    def delAct(self, _acts):
        i = 0
        for _act in _acts:
            keys = [self.worker_type, str(_act[0])]

            item = self.redisAccess.read_jhsact(keys)
            if item:
                end_time = item["end_time"]
                now_time = Common.time_s(self.crawling_time)
                # 删除过期的活动
                if now_time > end_time: 
                    i += 1
                    self.redisAccess.delete_jhsact(keys)
        print '# delete acts num:',i

    def delItem(self, _items):
        i = 0
        for _item in _items:
            keys = [self.worker_type, str(_item[0])]

            item = self.redisAccess.read_jhsitem(keys)
            if item:
                end_time = item["end_time"]
                now_time = Common.time_s(self.crawling_time)
                # 删除过期的商品
                if now_time > end_time: 
                    i += 1
                    self.redisAccess.delete_jhsitem(keys)
        print '# delete items num:',i

    # 查找结束的活动
    def scanEndActs(self, val):
        _acts = self.mysqlAccess.selectJhsActEnd(val)
        print '# end acts num:',len(_acts)
        # 删除已经结束的活动
        self.delAct(_acts)

    # 查找结束的商品
    def scanEndItems(self, val):
        _items = self.mysqlAccess.selectJhsItemEnd(val)
        print '# end items num:',len(_items)
        # 删除已经结束的商品
        self.delItem(_items)

    # acts redis
    def actsRedis(self):
        _acts = self.mysqlAccess.selectActsRedisdata()
        print '# acts num:',len(_acts)
        i = 0
        for _act in _acts:
            act_id = _act[2]
            #_itemids = self.mysqlAccess.selectItemsids(str(act_id))
            #item_ids = []
            #for _itemid in _itemids:
            #    item_ids.append(str(_itemid[0]))
            #    item_ids.append(str(_itemid[1]))
            #act_val = _act + (item_ids,)
            #print act_val
            #keys = [self.worker_type, str(act_id)]
            #print keys
            #if self.redisAccess.exist_jhsact(keys):
                #act_redis = self.redisAccess.read_jhsact(keys)
                #if len(act_redis) != 15:
                #    print act_redis
                #    i += 1
                #print self.redisAccess.read_jhsact(keys)
                #self.redisAccess.delete_jhsact(keys)
            #self.redisAccess.write_jhsact(keys, act_val)
            #i += 1
            #break
        print '# redis acts num:',i

    # items redis
    def itemsRedis(self):
        _items = self.mysqlAccess.selectItemRedisdata()
        print '# items num:', len(_items)
        i = 0
        #for _item in _items:
            #msg = self.message.jhsitemMsg(_item)
            #print msg
            #keys = [self.worker_type, str(_item[0])]
            #print keys
            #if self.redisAccess.exist_jhsitem(keys):
                #print self.redisAccess.read_jhsitem(keys)
                #self.redisAccess.delete_jhsitem(keys)
            #self.redisAccess.write_jhsitem(keys, msg)
            #i += 1 
            #break
        print '# redis items num:',i
Example #12
0
class JHSItemM(MyThread):
    '''A class of jhs item thread manager'''
    def __init__(self, _q_type, thread_num=10, a_val=None):
        """Set up the item thread manager.

        _q_type    -- queue type that selects the branch taken in ``crawl``
        thread_num -- forwarded to ``MyThread.__init__``
        a_val      -- optional extra values appended to every queued item value
        """
        # parent constructor
        MyThread.__init__(self, thread_num)

        # lock used by push_back
        self.mutex          = threading.Lock()

        self.worker_type    = Config.JHS_Brand

        # message
        self.message        = Message()

        # db
        self.mysqlAccess    = MysqlAccess() # mysql access
        self.redisAccess    = RedisAccess()     # redis db
        self.mongofsAccess  = MongofsAccess() # mongodb fs access

        # jhs queue type
        self._q_type        = _q_type # main: new items, day: items crawled once a day, hour: items crawled hourly, update: refresh

        # appendix val
        self.a_val          = a_val
        
        # results collected by the worker threads
        self.items          = []

        # dial client
        self.dial_client    = DialClient()

        # local ip
        self._ip            = Common.local_ip()

        # router tag
        self._tag           = 'ikuai'
        #self._tag          = 'tpent'

        # items given up after too many retries
        self.giveup_items   = []

    # To dial router
    def dialRouter(self, _type, _obj):
        try:
            _module = '%s_%s' %(_type, _obj)
            self.dial_client.send((_module, self._ip, self._tag))
        except Exception as e:
            print '# To dial router exception :', e

    def push_back(self, L, v):
        """Append ``v`` to ``L`` while holding the instance mutex.

        The lock is taken with ``with`` so that it is released even when
        ``L.append`` raises; previously such a failure left the mutex locked
        and blocked every other worker thread.
        """
        with self.mutex:
            L.append(v)

    def putItem(self, _item):
        """Queue one item with a retry count of 0."""
        entry = (0, _item)
        self.put_q(entry)

    def putItems(self, _items):
        """Queue every item of ``_items`` with a retry count of 0."""
        for entry in _items:
            self.put_q((0, entry))

    # To merge item
    def mergeAct(self, item, prev_item):
        """Copy into ``item`` every field it lacks from the stored ``prev_item``.

        Nothing happens when ``prev_item`` is empty. Time fields are only
        taken over when the stored value is itself non-empty.
        """
        if not prev_item:
            return

        if not item.item_position:
            item.item_position = prev_item["item_position"]
        if not item.item_juName:
            item.item_juName = prev_item["item_juname"]
        if not item.item_juDesc:
            item.item_juDesc = prev_item["item_judesc"]
        if not item.item_juPic_url:
            item.item_juPic_url = prev_item["item_jupic_url"]
        if not item.item_url:
            item.item_url = prev_item["item_url"]
        if not item.item_oriPrice:
            item.item_oriPrice = prev_item["item_oriprice"]
        if not item.item_actPrice:
            item.item_actPrice = prev_item["item_actprice"]
        if not item.item_discount:
            item.item_discount = prev_item["item_discount"]
        if not item.item_coupons:
            item.item_coupons = prev_item["item_coupons"].split(Config.sep)
        if not item.item_promotions:
            item.item_promotions = prev_item["item_promotions"].split(Config.sep)
        if not item.item_remindNum:
            item.item_remindNum = prev_item["item_remindnum"]
        # Lock time and lock flag are restored together
        if not item.item_isLock_time and prev_item["item_islock_time"]:
            item.item_isLock_time = Common.str2timestamp(prev_item["item_islock_time"])
            item.item_isLock = prev_item["item_islock"]
        if not item.item_starttime and prev_item["start_time"]:
            item.item_starttime = Common.str2timestamp(prev_item["start_time"])
        if not item.item_endtime and prev_item["end_time"]:
            item.item_endtime = Common.str2timestamp(prev_item["end_time"])

    # To put item redis db
    def putItemDB(self, item):
        """Merge ``item`` with its stored redis record and write it back."""
        redis_keys = [self.worker_type, str(item.item_juId)]
        self.mergeAct(item, self.redisAccess.read_jhsitem(redis_keys))
        payload = self.message.jhsitemMsg(item.outTupleForRedis())
        self.redisAccess.write_jhsitem(redis_keys, payload)

    # To crawl retry
    def crawlRetry(self, _data):
        if not _data: return
        _retry, _val = _data
        _retry += 1
        if _retry < Config.item_crawl_retry:
            _data = (_retry, _val)
            self.put_q(_data)
        else:
            self.push_back(self.giveup_items, _val)
            print "# retry too many times, no get item:", _val

    # insert item info
    def insertIteminfo(self, iteminfosql_list, f=False):
        """Flush the item-info batch when forced or when it reached ``Config.item_max_arg``.

        Returns True when a flush was due (the caller then resets its list),
        False otherwise. An empty batch is never sent to MySQL.
        """
        flush_due = f or len(iteminfosql_list) >= Config.item_max_arg
        if not flush_due:
            return False
        if iteminfosql_list:
            self.mysqlAccess.insertJhsItemInfo(iteminfosql_list)
        return True

    # insert item day
    def insertItemday(self, itemdaysql_list, f=False):
        """Flush the per-day batch when forced or when it reached ``Config.item_max_arg``.

        Returns True when a flush was due (the caller then resets its list),
        False otherwise. An empty batch is never sent to MySQL.
        """
        flush_due = f or len(itemdaysql_list) >= Config.item_max_arg
        if not flush_due:
            return False
        if itemdaysql_list:
            self.mysqlAccess.insertJhsItemForDay(itemdaysql_list)
        return True

    # insert item hour
    def insertItemhour(self, itemhoursql_list, f=False):
        """Flush the per-hour batch when forced or when it reached ``Config.item_max_arg``.

        Returns True when a flush was due (the caller then resets its list),
        False otherwise. An empty batch is never sent to MySQL.
        """
        flush_due = f or len(itemhoursql_list) >= Config.item_max_arg
        if not flush_due:
            return False
        if itemhoursql_list:
            self.mysqlAccess.insertJhsItemForHour(itemhoursql_list)
        return True

    # update item lock start-end time
    def updateItemLockStartEndtime(self, itemsql):
        """Forward a non-empty update tuple to MySQL; do nothing otherwise."""
        if not itemsql:
            return
        self.mysqlAccess.updateJhsItemLockStartEndtime(itemsql)

    def updateItems(self, itemsql_list, f=False):
        """Flush the update batch when forced or when it reached ``Config.item_max_arg``.

        Returns True when a flush was due (the caller then resets its list),
        False otherwise. An empty batch is never sent to MySQL.
        """
        flush_due = f or len(itemsql_list) >= Config.item_max_arg
        if not flush_due:
            return False
        if itemsql_list:
            self.mysqlAccess.updateJhsItems(itemsql_list)
        return True

    # To crawl item
    # To crawl item
    def crawl(self):
        """Worker loop: take queued items, crawl them and store the results.

        Each queue entry is a ``(retry_count, value)`` tuple. Depending on
        ``self._q_type`` the item is parsed with a different ``antPage*``
        method, collected in ``self.items`` and written to redis / MySQL.
        Info, day and hour rows are batched in local lists and flushed through
        the ``insertItem*`` helpers. When the queue raises ``Empty`` the
        pending batches are flushed and the loop ends.

        Every dequeued entry is acknowledged with ``self.queue.task_done()``,
        on success as well as in each exception handler. Invalid pages and
        unknown errors are re-queued through ``crawlRetry`` before the
        acknowledgement.
        """
        # item sql list
        _iteminfosql_list = []
        _itemdaysql_list = []
        _itemhoursql_list = []
        # NOTE: never filled; its flush below is commented out
        _itemupdatesql_list = []
        while True:
            _data = None
            try:
                try:
                    # take the next message from the queue
                    _data = self.get_q()
                except Empty as e:
                    # queue is empty: flush what is pending and leave the loop
                    #print '# queue is empty', e
                    # info
                    self.insertIteminfo(_iteminfosql_list, True)
                    _iteminfosql_list = []

                    # day
                    self.insertItemday(_itemdaysql_list, True)
                    _itemdaysql_list = []

                    # hour
                    self.insertItemhour(_itemhoursql_list, True)
                    _itemhoursql_list = []

                    # update
                    #self.updateItems(_itemupdatesql_list, True)
                    #_itemupdatesql_list = []

                    break

                item = None
                if self._q_type == 'main':
                    # new item instance
                    item = JHSItem()
                    _val = _data[1]
                    if self.a_val: _val = _val + self.a_val
                    item.antPage(_val)
                    # collect
                    # redis
                    self.putItemDB(item)
                    self.push_back(self.items, item.outTuple())
                    # store in the database (batched)
                    iteminfoSql = item.outTuple()
                    _iteminfosql_list.append(iteminfoSql)
                    if self.insertIteminfo(_iteminfosql_list): _iteminfosql_list = []
                elif self._q_type == 'day':
                    # daily item instance
                    item = JHSItem()
                    _val = _data[1]
                    if self.a_val: _val = _val + self.a_val
                    item.antPageDay(_val)
                    # collect
                    self.push_back(self.items, item.outSqlForDay())
                    # store in the database (batched)
                    daySql = item.outSqlForDay()
                    _itemdaysql_list.append(daySql)
                    if self.insertItemday(_itemdaysql_list): _itemdaysql_list = []
                elif self._q_type == 'hour':
                    # hourly item instance
                    item = JHSItem()
                    _val = _data[1]
                    if self.a_val: _val = _val + self.a_val
                    item.antPageHour(_val)
                    # collect
                    # redis
                    self.putItemDB(item)
                    self.push_back(self.items, item.outTupleHour())
                    # store in the database: the update goes out immediately, the hour row is batched
                    updateSql = item.outSqlForUpdate()
                    if updateSql:
                        self.mysqlAccess.updateJhsItem(updateSql)

                    hourSql = item.outSqlForHour()
                    _itemhoursql_list.append(hourSql)
                    if self.insertItemhour(_itemhoursql_list): _itemhoursql_list = []
                elif self._q_type == 'update':
                    # item update
                    item = JHSItem()
                    _val = _data[1]
                    if self.a_val: _val = _val + self.a_val

                    item.antPageUpdate(_val)
                    # collect
                    # redis
                    self.putItemDB(item)
                    self.push_back(self.items, item.outSqlForUpdate())
                    # store in the database
                    updateSql = item.outSqlForUpdate()
                    if updateSql:
                        self.mysqlAccess.updateJhsItem(updateSql)
                elif self._q_type == 'check':
                    # item check instance (same steps as the 'update' branch)
                    item = JHSItem()
                    _val = _data[1]
                    if self.a_val: _val = _val + self.a_val
                    item.antPageUpdate(_val)
                    # collect
                    # redis
                    self.putItemDB(item)
                    self.push_back(self.items, item.outSqlForUpdate())
                    # store in the database
                    updateSql = item.outSqlForUpdate()
                    if updateSql:
                        self.mysqlAccess.updateJhsItem(updateSql)

                # archive the crawled pages
                if item:
                    _pages = item.outItemPage(self._q_type)
                    self.mongofsAccess.insertJHSPages(_pages)

                # delay between items
                time.sleep(1)
                # tell the queue this task is finished
                self.queue.task_done()

            except Common.NoItemException as e:
                print 'Not item exception :', e
                # tell the queue this task is finished (no retry)
                self.queue.task_done()

            except Common.NoPageException as e:
                print 'Not page exception :', e
                # tell the queue this task is finished (no retry)
                self.queue.task_done()

            except Common.InvalidPageException as e:
                # re-queue first, then acknowledge the failed attempt
                self.crawlRetry(_data)
                print 'Invalid page exception :', e
                # tell the queue this task is finished
                self.queue.task_done()

            except Exception as e:
                print 'Unknown exception crawl item :', e
                Common.traceback_log()
                self.crawlRetry(_data)
                # tell the queue this task is finished
                self.queue.task_done()
                #if str(e).find('Name or service not known') != -1 or str(e).find('Temporary failure in name resolution') != -1:
                #    print _data
                # redial, unless the error was a read timeout
                if str(e).find('Read timed out') == -1:
                    try:
                        self.dialRouter(4, 'item')
                    except Exception as e:
                        print '# DailClient Exception err:', e 
                        time.sleep(10)
                # back off for a random 10-40 seconds before the next item
                time.sleep(random.uniform(10,40))
Example #13
0
class JHSGroupItemWorker():
    '''A class of JHS group item channel worker'''
    def __init__(self):
        """Create the helper objects and record the start time of this run."""
        # jhs group item type
        self.worker_type    = Config.JHS_GroupItem

        self.jhs_type       = Config.JHS_TYPE   # queue type

        # message
        self.message        = Message()

        # JSON data fetcher
        self.jsonpage       = Jsonpage()

        # crawler
        self.crawler        = TBCrawler()

        # crawl time settings
        self.crawling_time  = Common.now() # current crawl time
        self.begin_time     = Common.now()
        self.begin_date     = Common.today_s()
        self.begin_hour     = Common.nowhour_s()

        # DB
        # mysql access
        self.mysqlAccess    = MysqlAccess()

        # redis queue
        self.redisQueue     = RedisQueue()

        # redis access
        self.redisAccess    = RedisAccess()

        # mongodb fs access
        self.mongofsAccess  = MongofsAccess()

    def init_crawl(self, _obj, _crawl_type):
        """Reset the per-run state for the given object and crawl type."""
        self._obj, self._crawl_type = _obj, _crawl_type

        # Dialing: client, local ip and router tag
        self.dial_client = DialClient()
        self._ip = Common.local_ip()
        self._router_tag = 'ikuai'  # alternative tag: 'tpent'

        # Collected items and given-up items
        self.items, self.giveup_items = [], []

        # Message value to retry with after a give-up
        self.giveup_val = None

    # To dial router
    def dialRouter(self, _type, _obj):
        try:
            _module = '%s_%s' %(_type, _obj)
            self.dial_client.send((_module, self._ip, self._router_tag))
        except Exception as e:
            print '# To dial router exception :', e

    def push_back_list(self, L, v):
        """Extend ``L`` in place with every element of ``v`` (no locking)."""
        L.extend(v)

    def push_back_val(self, L, v):
        """Append the single value ``v`` to ``L`` in place (no locking)."""
        L.append(v)

    # To crawl retry
    def crawlRetry(self, _key, msg):
        if not msg: return
        msg['retry'] += 1
        _retry = msg['retry']
        _obj = msg["obj"]
        max_time = Config.crawl_retry
        if _obj == 'groupitemcat':
            max_time = Config.json_crawl_retry
        elif _obj == 'groupitem':
            max_time = Config.item_crawl_retry
        if _retry < max_time:
            self.redisQueue.put_q(_key, msg)
        else:
            #self.push_back(self.giveup_items, msg)
            print "# retry too many time, no get:", msg

    def crawlPage(self, _key, msg, _val):
        """Process one queue message and translate failures into retries.

        Only the 'groupitemcat' object is handled (via ``run_category``).
        Every caught exception re-queues the message through ``crawlRetry``;
        deny-page and unknown errors additionally request a redial, and most
        handlers sleep for a random interval before returning.
        """
        try:
            if self._obj == 'groupitemcat':
                self.run_category(msg, _val)
            else:
                print '# crawlPage unknown obj = %s' % self._obj
        except Common.InvalidPageException as e:
            print '# Invalid page exception:',e
            self.crawlRetry(_key,msg)
        except Common.DenypageException as e:
            print '# Deny page exception:',e
            self.crawlRetry(_key,msg)
            # redial
            try:
                self.dialRouter(4, 'chn')
            except Exception as e:
                print '# DailClient Exception err:', e
                time.sleep(random.uniform(10,30))
            time.sleep(random.uniform(10,30))
        except Common.SystemBusyException as e:
            print '# System busy exception:',e
            self.crawlRetry(_key,msg)
            time.sleep(random.uniform(10,30))
        except Common.RetryException as e:
            print '# Retry exception:',e
            # retry only with the given-up part when one was recorded
            if self.giveup_val:
                msg['val'] = self.giveup_val
            self.crawlRetry(_key,msg)
            time.sleep(random.uniform(20,30))
        except Exception as e:
            print '# exception err:',e
            self.crawlRetry(_key,msg)
            # redial, unless the error was a read timeout
            if str(e).find('Read timed out') == -1:
                try:
                    self.dialRouter(4, 'chn')
                except Exception as e:
                    print '# DailClient Exception err:', e
            time.sleep(random.uniform(10,30))
            Common.traceback_log()

    def run_category(self, msg, _val):
        category_val = msg["val"]
        refers = _val
        c_url,c_name,c_id = category_val
        print c_url,c_name,c_id
        page = self.crawler.getData(c_url, refers)
        page_val = (page,c_name,c_id)
        ajax_url_list = self.getAjaxurlList(page_val,c_url)
        if len(ajax_url_list) > 0:
            self.get_jsonitems(ajax_url_list)

    # get json ajax url
    def getAjaxurlList(self, page_val, refers=''):
        """Extract the ajax urls from a category page.

        ``page_val`` is (page html, category name, category id). Returns a
        list of ``(url, refers, (c_id, c_name, refers, sub_nav))`` tuples, one
        per element carrying a ``data-ajaxurl`` attribute. ``sub_nav`` is the
        text of the ``l-f-tbox`` span when present, else the category name.
        """
        page, c_name, c_id = page_val
        block_re = re.compile(r'<.+?data-ajaxurl="(.+?)".+?>(.+?)</div>',flags=re.S)
        found = []
        for hit in block_re.finditer(page):
            ajax_url = hit.group(1).replace('amp;','')
            label = re.search(r'<span class="l-f-tbox">(.+?)</span>',hit.group(2),flags=re.S)
            sub_nav = label.group(1).strip() if label else c_name
            found.append((ajax_url, refers, (c_id, c_name, refers, sub_nav)))
        return found

    # get item json list in category page from ajax url
    def get_jsonitems(self, ajax_url_list):
        # today all items val
        todayall_item_val = []
        # other sub nav items val
        item_list = []
        # process ajax url list
        item_json_index = 0
        # mongo json pages
        cat_pages = {}
        for a_url in ajax_url_list:
            # get json from ajax url
            Result_list = self.jsonpage.get_json([a_url])
            # mongo page json
            _url,_refers,_val = a_url 
            _c_id = _val[0]
            time_s = time.strftime("%Y%m%d%H", time.localtime(self.crawling_time))
            # timeStr_jhstype_webtype_itemgroupcat_catid
            key = '%s_%s_%s_%s_%s' % (time_s,Config.JHS_TYPE,'1','itemgroupcat',str(_c_id))
            cat_pages[key] = '<!-- url=%s --> %s' % (_url,str(Result_list))

            if Result_list and len(Result_list) > 0:
                item_result_valList = self.jsonpage.parser_itemjson(Result_list)
                if item_result_valList and len(item_result_valList) > 0:
                    item_json_index += 1
                    # the first item list is all online items
                    if item_json_index == 1:
                        if len(item_result_valList) > 0:
                            print '# all online items.....'
                            todayall_item_val = item_result_valList
                    else:
                        self.push_back_list(item_list, item_result_valList)
                else:
                    print '# not get itemjson parse val list...'
        if len(item_list) > 0:
            self.parseItems(item_list)

        # cat pages json 
        for key in cat_pages.keys():
            _pages = (key,cat_pages[key])
            self.mongofsAccess.insertJHSPages(_pages)

    # 解析从接口中获取的商品数据
    def parseItems(self, item_list):
        print '# parse Group Items start:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

        # 附加信息
        a_val = (self.begin_time,)
        # 多线程 控制并发的线程数
        max_th = Config.item_max_th
        if len(item_list) > max_th:
            m_itemsObj = JHSGroupItemParserM(self._crawl_type, max_th, a_val)
        else:
            m_itemsObj = JHSGroupItemParserM(self._crawl_type, len(item_list), a_val)
        m_itemsObj.createthread()
        m_itemsObj.putItems(item_list)
        m_itemsObj.run()

        _items = m_itemsObj.items
        self.push_back_list(self.items,_items)
        print '# queue item num:',len(self.items)
        print '# parse item num:',len(_items)
        print '# parse Group Items end:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

    def process(self, _obj, _crawl_type, _val=None):
        """Initialise the run and dispatch: 'groupitem' goes to ``processMulti``, anything else to ``processOne``."""
        self.init_crawl(_obj, _crawl_type)
        runner = self.processMulti if _obj == 'groupitem' else self.processOne
        runner(_val)

    def processOne(self, _val=None):
        i, M = 0, 10
        n = 0
        while True: 
            try:
                if self._crawl_type and self._crawl_type != '':
                    _key = '%s_%s_%s' % (self.jhs_type, self._obj, self._crawl_type)
                else:
                    _key = '%s_%s' % (self.jhs_type, self._obj)
                _msg = self.redisQueue.get_q(_key)

                # 队列为空
                if not _msg:
                    i += 1
                    if i > M:
                        print '# all get catQ item num:',n
                        print '# not get catQ of key:',_key
                        break
                    time.sleep(10)
                    continue
                n += 1
                self.crawlPage(_key, _msg, _val)

            except Exception as e:
                print '# exception err in process of JHSGroupItemWorker:',e,_key,_msg

    def processMulti(self, _val=None):
        if self._crawl_type and self._crawl_type != '':
            _key = '%s_%s_%s' % (self.jhs_type, self._obj, self._crawl_type)
        else:
            _key = '%s_%s' % (self.jhs_type, self._obj)

        try:
            self.crawlPageMulti(_key, _val)
        except Exception as e:
            print '# exception err in processMulti of JHSGroupItemWorker: %s, key: %s' % (e,_key)

    # To crawl page
    def crawlPageMulti(self, _key, _val):
        if self._obj == 'groupitem':
            self.run_groupitem(_key, _val)
        else:
            print '# crawlPageMulti unknown obj = %s' % self._obj

    def run_groupitem(self, _key, _val):
        m_itemQ = JHSGroupItemQM(self._obj, self._crawl_type, 20, _val)
        m_itemQ.createthread()
        m_itemQ.run()
        item_list = m_itemQ.items
        print '# crawl Items num: %d' % len(item_list)

    # 删除redis数据库过期商品
    def delItem(self, _items):
        """Delete from redis every listed item whose end time lies before the run's begin time.

        Each entry of ``_items`` is a dict with an "item_juId" key.
        """
        for entry in _items:
            redis_keys = [self.worker_type, entry["item_juId"]]
            stored = self.redisAccess.read_jhsitem(redis_keys)
            if not stored:
                continue
            end_time = stored["end_time"]
            now_time = Common.time_s(self.begin_time)
            # Only expired items are removed
            if now_time > end_time:
                self.redisAccess.delete_jhsitem(redis_keys)

    # 把商品信息存入redis数据库中
    def putItemDB(self, _items):
        """Write the base data of every not-yet-stored item to redis."""
        for entry in _items:
            redis_keys = [self.worker_type, entry["item_juId"]]
            # Items already present in redis are left untouched
            if self.redisAccess.exist_jhsitem(redis_keys):
                continue

            info = self.message.itemInfo(entry["r_val"])
            self.redisAccess.write_jhsitem(redis_keys, self.message.itemMsg(info))

    # 更新商品信息
    def updateItem(self, _item):
        """Refresh the start/end time of a stored item when the parsed values differ.

        The redis record is rewritten only if at least one of the two times changed.
        """
        redis_keys = [self.worker_type, _item["item_juId"]]
        stored = self.redisAccess.read_jhsitem(redis_keys)
        if not stored:
            return

        parsed = self.message.itemParseInfo(_item["r_val"])
        dirty = False
        for field in ("start_time", "end_time"):
            if stored[field] != parsed[field]:
                stored[field] = parsed[field]
                dirty = True
        if dirty:
            self.redisAccess.write_jhsitem(redis_keys, stored)

    # 查找新商品
    def selectNewItems(self, _items):
        """Return the "val" of every item not yet in redis; known items are refreshed via ``updateItem``."""
        fresh = []
        for entry in _items:
            redis_keys = [self.worker_type, entry["item_juId"]]
            if not self.redisAccess.exist_jhsitem(redis_keys):
                fresh.append(entry["val"])
            else:
                self.updateItem(entry)
        return fresh

    def scanEndItems(self):
        val = (Common.time_s(self.crawling_time),)
        _items = self.mysqlAccess.selectJhsGroupItemEnd(val)
        end_items = []
        # 遍历商品
        for _item in _items:
            item_juid = _item[0]
            end_items.append({"item_juId":str(item_juid)})
        print '# del item nums:',len(end_items)
        # 删除已经结束的商品
        self.delItem(end_items)

    def scanEndItemsLasthour(self):
        val = (Common.add_hours(self.crawling_time, -2),Common.add_hours(self.crawling_time, -2),Common.add_hours(self.crawling_time, -1))
        _items = self.mysqlAccess.selectJhsGroupItemEndLastOneHour(val)
        end_items = []
        # 遍历商品
        for _item in _items:
            item_juid = _item[0]
            end_items.append({"item_juId":str(item_juid)})
        print '# del item nums for last hour end:',len(end_items)
        # 删除已经结束的商品
        self.delItem(end_items)
            
    def scanAliveItems(self):
        # 到结束时间后的一个小时
        val = (Common.time_s(self.crawling_time), Common.add_hours(self.crawling_time, -1))
        # 查找已经开团但是没有结束的商品
        _items = self.mysqlAccess.selectJhsGroupItemAlive(val)
        print "# hour all item nums:",len(_items)
        return _items

    def scanNotEndItems(self):
        val = (Common.time_s(self.crawling_time),)
        # 查找没有结束的商品
        _items = self.mysqlAccess.selectJhsGroupItemNotEnd(val)
        i = 1
        for _item in _items:
            print i
            item_juid = str(_item[1])
            keys = [self.worker_type, item_juid]

            item = self.redisAccess.read_jhsitem(keys)
            print item
            #_new_item = {"crawling_time":item["crawling_time"],"item_juid":item["item_juId"],"groupcat_id":item["item_groupCatId"],"groupcat_name":item["item_groupCatName"],"item_ju_url":item["item_ju_url"],"item_juname":item["item_juName"],"item_id":item["item_id"],"start_time":item["start_time"],"end_time":item["end_time"]}
            #self.redisAccess.write_jhsitem(keys, _new_item)
            i += 1

    def scanCategories(self):
        """Return the group-item categories stored in MySQL."""
        return self.mysqlAccess.selectJhsGroupItemCategory()
Example #14
0
class JHSGroupItemCrawlerM(MyThread):
    '''A class of jhs item thread manager'''
    def __init__(self, jhs_type, thread_num=10, a_val=None):
        '''Initialise the MyThread base, storage clients and dial settings.

        jhs_type   -- queue type; crawl() acts on 'hour' and 'new'
        thread_num -- passed straight to MyThread.__init__
        a_val      -- optional value concatenated onto each queued value
                      before it is crawled
        '''
        # parent construct
        MyThread.__init__(self, thread_num)

        # thread lock
        self.mutex = threading.Lock()

        # db
        self.mysqlAccess = MysqlAccess() # mysql access
        self.mongofsAccess = MongofsAccess() # mongodb fs access

        # jhs queue type
        self.jhs_type = jhs_type # original note: h = hourly, i = item detail info

        # appendix val
        self.a_val = a_val
        
        # activity items
        self.items = []

        # dial client
        self.dial_client = DialClient()

        # local ip
        self._ip = Common.local_ip()

        # router tag
        self._tag = 'ikuai'
        #self._tag = 'tpent'

        # give up item, retry too many times
        self.giveup_items = []

    # To dial router
    def dialRouter(self, _type, _obj):
        try:
            _module = '%s_%s' %(_type, _obj)
            self.dial_client.send((_module, self._ip, self._tag))
        except Exception as e:
            print '# To dial router exception :', e

    def push_back(self, L, v):
        if self.mutex.acquire(1):
            L.append(v)
            self.mutex.release()

    def putItem(self, _item):
        self.put_q((0, _item))

    def putItems(self, _items):
        for _item in _items: self.put_q((0, _item))

    # To crawl retry
    def crawlRetry(self, _data):
        if not _data: return
        _retry, _val = _data
        _retry += 1
        if _retry < Config.crawl_retry:
            _data = (_retry, _val)
            self.put_q(_data)
        else:
            self.push_back(self.giveup_items, _val)
            print "# retry too many times, no get item:", _val

    # insert item info
    def insertIteminfo(self, iteminfosql_list, f=False):
        if f or len(iteminfosql_list) >= Config.item_max_arg:
            if len(iteminfosql_list) > 0:
                self.mysqlAccess.insertJhsGroupItemInfo(iteminfosql_list)
                #print '# insert data to database'
            return True
        return False

    # insert item hour
    def insertItemhour(self, itemhoursql_list, f=False):
        if f or len(itemhoursql_list) >= Config.item_max_arg:
            if len(itemhoursql_list) > 0:
                self.mysqlAccess.insertJhsGroupItemForHour(itemhoursql_list)
                #print '# insert hour data to database'
            return True
        return False

    # insert item coming
    def insertItemComing(self, itemsql_list, f=False):
        if f or len(itemsql_list) >= Config.item_max_arg:
            if len(itemsql_list) > 0:
                self.mysqlAccess.insertJhsGroupItemComing(itemsql_list)
                #print '# insert item coming data to database'
            return True
        return False

    # insert item position
    def insertItemPosition(self, itemsql_list, f=False):
        if f or len(itemsql_list) >= Config.item_max_arg:
            if len(itemsql_list) > 0:
                self.mysqlAccess.insertJhsGroupItemPosition(itemsql_list)
                #print '# insert position data to database'
            return True
        return False

    # To crawl item
    def crawl(self):
        '''Worker loop: take queued entries, crawl each item, store results.

        Runs until get_q raises Empty; the four pending SQL batches are then
        flushed with f=True and the loop exits. For every entry:

        * jhs_type 'hour': crawl with antPageGroupItemHour, apply the update
          row (if any) immediately and batch the hourly row.
        * jhs_type 'new': crawl with antPageGroupItem, collect the row in
          self.items and batch it for insertion.
        * any other jhs_type: mark the task done and skip the entry.

        Crawled pages are handed to mongofsAccess.insertJHSPages. NoItem and
        NoPage exceptions are only printed; InvalidPage and every other
        exception send the entry through crawlRetry. Other exceptions also
        print a traceback, redial the router unless the message contains
        'Read timed out', and then pause for a random 10-30 seconds.
        '''
        # item sql list
        _iteminfosql_list = []
        _itemhoursql_list = []
        # NOTE(review): nothing in this method appends to the two lists
        # below; they are only flushed when the queue runs empty.
        _itemcomingsql_list = []
        _itempositionsql_list = []
        while True:
            _data = None
            try:
                try:
                    # fetch the next queue message
                    _data = self.get_q()
                except Empty as e:
                    # queue is empty: flush every pending batch, then exit
                    #print '# queue is empty', e
                    # info
                    self.insertIteminfo(_iteminfosql_list, True)
                    _iteminfosql_list = []

                    # hour
                    self.insertItemhour(_itemhoursql_list, True)
                    _itemhoursql_list = []

                    # coming
                    self.insertItemComing(_itemcomingsql_list, True)
                    _itemcomingsql_list = []

                    # position
                    self.insertItemPosition(_itempositionsql_list, True)
                    _itempositionsql_list = []

                    break

                item = None
                crawl_type = ''
                if self.jhs_type == 'hour':
                    # item instance for the hourly crawl
                    item = JHSItem()
                    _val = _data[1]
                    if self.a_val: _val = _val + self.a_val

                    item.antPageGroupItemHour(_val)
                    #print '# Hour To crawl item val : ', Common.now_s(), _val[0], _val[4], _val[5]
                    crawl_type = 'groupitem'
                    # collecting into self.items is disabled for hourly crawls
                    #self.push_back(self.items, item.outTupleGroupItemHour())

                    update_Sql,hourSql = item.outTupleGroupItemHour()
                    if update_Sql:
                        self.mysqlAccess.updateJhsGroupItem(update_Sql)
                    _itemhoursql_list.append(hourSql)
                    if self.insertItemhour(_itemhoursql_list): _itemhoursql_list = []

                elif self.jhs_type == 'new':
                    # item info
                    item = JHSItem()
                    _val = _data[1]
                    if self.a_val: _val = _val + self.a_val

                    item.antPageGroupItem(_val)
                    #print '# To crawl item val : ', Common.now_s(), _val[0], _val[4], _val[5]
                    crawl_type = 'groupitemnew'
                    # collect the crawled row
                    self.push_back(self.items, item.outTupleGroupItem())

                    iteminfoSql = item.outTupleGroupItem()
                    _iteminfosql_list.append(iteminfoSql)
                    if self.insertIteminfo(_iteminfosql_list): _iteminfosql_list = []
                else:
                    # tell the queue this task is finished
                    self.queue.task_done()
                    continue

                # store the crawled pages
                if item and crawl_type != '':
                    _pages = item.outItemPage(crawl_type)
                    self.mongofsAccess.insertJHSPages(_pages)


                # delay between items
                time.sleep(1)
                # tell the queue this task is finished
                self.queue.task_done()

            except Common.NoItemException as e:
                print 'Not item exception :', e
                # tell the queue this task is finished
                self.queue.task_done()

            except Common.NoPageException as e:
                print 'Not page exception :', e
                # tell the queue this task is finished
                self.queue.task_done()

            except Common.InvalidPageException as e:
                self.crawlRetry(_data)
                print 'Invalid page exception :', e
                # tell the queue this task is finished
                self.queue.task_done()

            except Exception as e:
                print 'Unknown exception crawl item :', e
                #traceback.print_exc()
                print '#####--Traceback Start--#####'
                tp,val,td = sys.exc_info()
                for file, lineno, function, text in traceback.extract_tb(td):
                    print "exception traceback err:%s,line:%s,in:%s"%(file, lineno, function)
                    print text
                print "exception traceback err:%s,%s,%s"%(tp,val,td)
                print '#####--Traceback End--#####'
                self.crawlRetry(_data)
                # tell the queue this task is finished
                self.queue.task_done()
                if str(e).find('Name or service not known') != -1 or str(e).find('Temporary failure in name resolution') != -1:
                    print _data
                # redial the router (skipped for read timeouts)
                if str(e).find('Read timed out') == -1:
                    try:
                        self.dialRouter(4, 'item')
                    except Exception as e:
                        print '# DailClient Exception err:', e 
                        time.sleep(10)
                time.sleep(random.uniform(10,30))
Example #15
0
class RetryCrawler():
    '''A class of retry crawl data'''
    def __init__(self):
        '''Create the crawler and dial client and set the base wait time.'''
        # crawl settings
        self.crawler = TBCrawler()
        # dial client
        self.dial_client = DialClient()
        # local ip
        self._ip = Common.local_ip()
        # router tag
        self._tag = 'ikuai'

        # wait time; getData multiplies it by the attempt number when sleeping
        self.w_time = 1

    # To dial router
    def dialRouter(self, _type, _obj):
        try:
            _module = '%s_%s' %(_type, _obj)
            self.dial_client.send((_module, self._ip, self._tag))
        except Exception as e:
            print '# To dial router exception :', e

    def getData(self, url, refers='', max_retry=20):
        page = ''
        retry = 1
        while True:
            try:
                page = self.crawler.getData(url, refers)
                break
            except Common.InvalidPageException as e:
                if retry >= max_retry:
                    break
                retry += 1
                print '# Invalid page exception:',e
                time.sleep(self.w_time*retry)
            except Common.DenypageException as e:
                if retry >= max_retry:
                    break
                retry += 1
                print '# Deny page exception:',e
                # 重新拨号
                try:
                    self.dialRouter(4, 'chn')
                except Exception as e:
                    print '# DailClient Exception err:', e
                time.sleep(random.uniform(10,30))

            except Common.SystemBusyException as e:
                if retry >= max_retry:
                    break
                retry += 1
                print '# System busy exception:',e
                time.sleep(self.w_time*retry)
            except Exception as e:
                print '# exception err in retry crawler:',e
                if str(e).find('Read timed out') != -1:
                    if retry >= max_retry:
                        break
                    retry += 1
                    time.sleep(self.w_time*retry)
                elif str(e).find('Name or service not known') != -1 or str(e).find('Temporary failure in name resolution'):
                    if retry >= max_retry:
                        break
                    retry += 1
                    
                    # 重新拨号
                    try:
                        self.dialRouter(4, 'chn')
                    except Exception as e:
                        print '# DailClient Exception err:', e
                    time.sleep(random.uniform(10,30))
                else:
                    break

        return page
Example #16
0
class TCItemM(MyThread):
    '''A class of tc item thread manager'''
    def __init__(self, _q_type, thread_num=10, a_val=None):
        '''Initialise the MyThread base, storage clients and dial settings.

        _q_type    -- queue type; crawl() only acts on 'spot'
        thread_num -- passed straight to MyThread.__init__
        a_val      -- optional value concatenated onto each queued value
                      before it is crawled
        '''
        # parent construct
        MyThread.__init__(self, thread_num)

        # thread lock
        self.mutex          = threading.Lock()

        self.worker_type    = Config.TC_Spot

        # message
        self.message        = Message()

        # db
        self.mysqlAccess    = MysqlAccess()   # mysql access
        self.mongofsAccess  = MongofsAccess() # mongodb fs access

        # tc queue type
        self._q_type        = _q_type # original note: new = newly added items

        # appendix val
        self.a_val          = a_val
        
        # activity items
        self.items          = []

        # dial client
        self.dial_client    = DialClient()

        # local ip
        self._ip            = Common.local_ip()

        # router tag
        self._tag           = 'ikuai'
        #self._tag          = 'tpent'

        # give up item, retry too many times
        self.giveup_items   = []

    # To dial router
    def dialRouter(self, _type, _obj):
        try:
            _module = '%s_%s' %(_type, _obj)
            self.dial_client.send((_module, self._ip, self._tag))
        except Exception as e:
            Common.log('# To dial router exception: %s' % e)

    def push_back(self, L, v):
        if self.mutex.acquire(1):
            L.append(v)
            self.mutex.release()

    def putItem(self, _item):
        self.put_q((0, _item))

    def putItems(self, _items):
        for _item in _items: self.put_q((0, _item))

    # To crawl retry
    def crawlRetry(self, _data):
        if not _data: return
        _retry, _val = _data
        _retry += 1
        if _retry < Config.item_crawl_retry:
            _data = (_retry, _val)
            self.put_q(_data)
        else:
            self.push_back(self.giveup_items, _val)
            Common.log('# retry too many times, no get item:')
            Common.log(_val)

    # insert item info
    def insertIteminfo(self, iteminfosql_list, f=False):
        if f or len(iteminfosql_list) >= Config.item_max_arg:
            if len(iteminfosql_list) > 0:
                self.mysqlAccess.insertTCItem(iteminfosql_list)
            return True
        return False


    # To crawl item
    def crawl(self):
        '''Worker loop: take queued entries, crawl each item, store results.

        Runs until get_q raises Empty; the pending item batch is then
        flushed with f=True and the loop exits. When _q_type is 'spot' the
        entry is crawled with Item.antPage, its row is collected in
        self.items, its tickets (if any) are inserted right away and the
        row is batched for insertion. For any other _q_type the entry is
        only delayed and marked done.

        NoItem and NoPage exceptions are only logged; InvalidPage and every
        other exception send the entry through crawlRetry. Other exceptions
        also redial the router and pause for a random 10-40 seconds unless
        the message contains 'Read timed out'.
        '''
        # item sql list
        _iteminfosql_list = []
        # NOTE(review): the three lists below are never read in this method.
        _itemdaysql_list = []
        _itemhoursql_list = []
        _itemupdatesql_list = []
        while True:
            _data = None
            try:
                try:
                    # fetch the next queue message
                    _data = self.get_q()
                except Empty as e:
                    # queue is empty: flush the pending batch, then exit
                    # info
                    self.insertIteminfo(_iteminfosql_list, True)
                    _iteminfosql_list = []

                    break

                item = None
                obj = 'item'
                if self._q_type == 'spot':
                    # new item instance
                    item = Item()
                    _val = _data[1]
                    if self.a_val: _val = _val + self.a_val
                    item.antPage(_val)
                    # collect the crawled row
                    self.push_back(self.items, item.outSql())
                
                    # store into the database
                    tickets = item.item_tickets
                    if tickets and len(tickets) > 0:
                        self.mysqlAccess.insertTCTicket(tickets)
                    iteminfoSql = item.outSql()
                    _iteminfosql_list.append(iteminfoSql)
                    if self.insertIteminfo(_iteminfosql_list): _iteminfosql_list = []

                # store the crawled pages (currently disabled)
                #if item:
                #    _pages = item.outItemPage(obj, self._q_type)
                #    self.mongofsAccess.insertTCPages(_pages)

                # delay between items
                time.sleep(1)
                # tell the queue this task is finished
                self.queue.task_done()

            except Common.NoItemException as e:
                Common.log('# Not item exception: %s' % e)
                # tell the queue this task is finished
                self.queue.task_done()

            except Common.NoPageException as e:
                Common.log('# Not page exception: %s' % e)
                # tell the queue this task is finished
                self.queue.task_done()

            except Common.InvalidPageException as e:
                self.crawlRetry(_data)
                Common.log('# Invalid page exception: %s' % e)
                # tell the queue this task is finished
                self.queue.task_done()

            except Exception as e:
                Common.log('# Unknown exception crawl item: %s' % e)
                Common.traceback_log()
                self.crawlRetry(_data)
                # tell the queue this task is finished
                self.queue.task_done()
                if str(e).find('Name or service not known') != -1 or str(e).find('Temporary failure in name resolution') != -1:
                    Common.log(_data)
                if str(e).find('Read timed out') == -1:
                    # redial the router
                    try:
                        self.dialRouter(4, 'item')
                    except Exception as e:
                        Common.log('# DailClient Exception err: %s' % e)
                        time.sleep(10)
                    time.sleep(random.uniform(10,40))
Example #17
0
class TCItemRedisM(MyThread):
    '''A class of tc Item redis queue'''
    def __init__(self, key, q_type, thread_num=10, a_val=None):
        '''Initialise the MyThread base, storage clients and dial settings.

        key        -- redis queue key that crawl() polls
        q_type     -- queue type; crawl() only acts on 'spot'
        thread_num -- passed straight to MyThread.__init__
        a_val      -- optional value concatenated onto each message value
                      before it is crawled
        '''
        # parent construct
        MyThread.__init__(self, thread_num)
        # thread lock
        self.mutex          = threading.Lock()

        self.tc_type        = Config.TC_TYPE # tc type
        #self.item_type      = q_type        # item queue type

        # db
        self.mysqlAccess    = MysqlAccess() # mysql access
        self.redisQueue     = RedisQueue()  # redis queue
        self.mongofsAccess  = MongofsAccess() # mongodb fs access

        # tc queue type
        self.tc_queue_type  = q_type # new...
        self._key           = key   # redis queue key

        # appendix val
        self.a_val          = a_val

        # return items
        self.items          = []

        # dial client
        self.dial_client    = DialClient()

        # local ip
        self._ip            = Common.local_ip()

        # router tag
        self._tag           = 'ikuai'
        #self._tag          = 'tpent'

        # give up item, retry too many times
        self.giveup_items   = []

    # To dial router
    def dialRouter(self, _type, _obj):
        try:
            _module = '%s_%s' %(_type, _obj)
            self.dial_client.send((_module, self._ip, self._tag))
        except Exception as e:
            Common.log('# To dial router exception: %s' % e)

    def push_back(self, L, v):
        if self.mutex.acquire(1):
            L.append(v)
            self.mutex.release()

    def crawlRetry(self, _key, msg):
        if not msg: return
        msg['retry'] += 1
        _retry = msg['retry']
        _obj = msg["obj"]
        max_time = Config.crawl_retry
        if _obj == 'item':
            max_time = Config.item_crawl_retry
        if _retry < max_time:
            self.redisQueue.put_q(_key, msg)
        else:
            Common.log('# retry too many time, no get msg:')
            Common.log(msg)

    # insert item
    def insertIteminfo(self, iteminfosql_list, f=False):
        if f or len(iteminfosql_list) >= Config.item_max_arg:
            if len(iteminfosql_list) > 0:
                self.mysqlAccess.insertTCItem(iteminfosql_list)
            return True
        return False

    # item sql list
    def crawl(self):
        '''Worker loop: pop messages from the redis queue and crawl each item.

        Each pass polls self._key. An empty poll flushes the pending item
        batch, and after more than M (2) empty polls in total the loop
        exits; otherwise it sleeps 10 seconds and polls again. Messages are
        only processed when tc_queue_type is 'spot': the item is crawled
        with Item.antPage, its row is collected in self.items, its tickets
        (if any) are inserted right away and the row is batched.

        NoItem and NoPage exceptions are only logged; InvalidPage and every
        other exception send the message through crawlRetry. Other
        exceptions also redial the router and pause for a random 10-30
        seconds unless the message text contains 'Read timed out'.
        '''
        _iteminfosql_list = []
        # i counts empty polls; the loop ends once it exceeds M.
        i, M = 0, 2
        # n counts the messages taken from the queue.
        n = 0
        while True:
            # Reset on every pass so the handlers below never retry the
            # previous message (or hit an unbound name) when get_q fails.
            _data = None
            try:
                _data = self.redisQueue.get_q(self._key)

                if not _data:
                    # Queue is empty: flush whatever is batched so far.
                    self.insertIteminfo(_iteminfosql_list, True)
                    _iteminfosql_list = []

                    i += 1
                    if i > M:
                        Common.log('# all get itemQ item num: %d' % n)
                        Common.log('# not get itemQ of key: %s' % self._key)
                        break
                    time.sleep(10)
                    continue
                n += 1
                if self.tc_queue_type != 'spot':
                    # Unsupported queue type: drop the message.
                    continue

                # Crawl the item described by the message.
                item = Item()
                _val = _data["val"]
                if self.a_val:
                    _val = _val + self.a_val

                item.antPage(_val)
                # Collect the crawled row for the caller.
                self.push_back(self.items, item.outSql())

                # Persist the tickets (if any) and batch the item row.
                tickets = item.item_tickets
                if tickets and len(tickets) > 0:
                    self.mysqlAccess.insertTCTicket(tickets)
                iteminfoSql = item.outSql()
                _iteminfosql_list.append(iteminfoSql)
                if self.insertIteminfo(_iteminfosql_list):
                    _iteminfosql_list = []

                # Delay between items.
                time.sleep(1)

            except Common.NoItemException as e:
                Common.log('# Not item exception: %s' % e)

            except Common.NoPageException as e:
                Common.log('# Not page exception: %s' % e)

            except Common.InvalidPageException as e:
                self.crawlRetry(self._key, _data)
                Common.log('# Invalid page exception: %s' % e)

            except Exception as e:
                Common.log('# Unknown exception crawl item: %s' % e)
                Common.traceback_log()

                self.crawlRetry(self._key, _data)
                if str(e).find('Read timed out') == -1:
                    # Redial the router before carrying on.
                    try:
                        self.dialRouter(4, 'item')
                    except Exception as dial_err:
                        Common.log('# DailClient Exception err: %s' % dial_err)
                        time.sleep(10)
                    time.sleep(random.uniform(10,30))
Example #18
0
class JHSGroupItemQM(MyThread):
    '''A class of jhs Item redis queue'''
    def __init__(self, itemtype, q_type, thread_num=10, a_val=None):
        '''Initialise the MyThread base, storage clients and dial settings.

        itemtype   -- item type, used as the middle part of the queue key
        q_type     -- queue type; crawl() acts on 'hour' and 'new'
        thread_num -- passed straight to MyThread.__init__
        a_val      -- optional value concatenated onto each message value
                      before it is crawled
        '''
        # parent construct
        MyThread.__init__(self, thread_num)
        # thread lock
        self.mutex          = threading.Lock()

        self.jhs_type       = Config.JHS_TYPE # jhs type
        self.item_type      = itemtype      # item type

        # db
        self.mysqlAccess    = MysqlAccess() # mysql access
        self.redisQueue     = RedisQueue()  # redis queue
        self.mongofsAccess  = MongofsAccess() # mongodb fs access

        # jhs queue type
        self.jhs_queue_type = q_type     # original note: h = hourly
        # redis queue key: "<jhs type>_<item type>_<queue type>"
        self._key           = '%s_%s_%s' % (self.jhs_type,self.item_type,self.jhs_queue_type)

        # appendix val
        self.a_val          = a_val

        # activity items
        self.items          = []

        # dial client
        self.dial_client    = DialClient()

        # local ip
        self._ip            = Common.local_ip()

        # router tag
        self._tag           = 'ikuai'
        #self._tag          = 'tpent'

        # give up item, retry too many times
        self.giveup_items   = []

    # To dial router
    def dialRouter(self, _type, _obj):
        try:
            _module = '%s_%s' %(_type, _obj)
            self.dial_client.send((_module, self._ip, self._tag))
        except Exception as e:
            print '# To dial router exception :', e

    def push_back(self, L, v):
        if self.mutex.acquire(1):
            L.append(v)
            self.mutex.release()

    # To crawl retry
    def crawlRetry(self, msg):
        if not msg: return
        msg['retry'] += 1
        _retry = msg['retry']
        _obj = msg["obj"]
        max_time = Config.crawl_retry
        if _obj == 'groupitemcat':
            max_time = Config.json_crawl_retry
        elif _obj == 'groupitem':
            max_time = Config.item_crawl_retry
        if _retry < max_time:
            self.redisQueue.put_q(self._key, msg)
        else:
            #self.push_back(self.giveup_items, msg)
            print "# retry too many time, no get:", msg

    # insert item info
    def insertIteminfo(self, iteminfosql_list, f=False):
        if f or len(iteminfosql_list) >= Config.item_max_arg:
            if len(iteminfosql_list) > 0:
                self.mysqlAccess.insertJhsGroupItemInfo(iteminfosql_list)
                #print '# insert data to database'
            return True
        return False

    # insert item hour
    def insertItemhour(self, itemhoursql_list, f=False):
        if f or len(itemhoursql_list) >= Config.item_max_arg:
            if len(itemhoursql_list) > 0:
                self.mysqlAccess.insertJhsGroupItemForHour(itemhoursql_list)
                #print '# insert hour data to database'
            return True
        return False

    # item sql list
    def crawl(self):
        _iteminfosql_list = []
        _itemhoursql_list = []
        i, M = 0, 10
        n = 0
        while True:
            try:
                _msg = self.redisQueue.get_q(self._key)

                # 队列为空
                if not _msg:
                    # 队列为空,退出
                    #print '# queue is empty', e
                    # info
                    self.insertIteminfo(_iteminfosql_list, True)
                    _iteminfosql_list = []

                    # hour
                    self.insertItemhour(_itemhoursql_list, True)
                    _itemhoursql_list = []

                    i += 1
                    if i > M:
                        print '# all get itemQ item num:',n
                        print '# not get itemQ of key:',self._key
                        break
                    time.sleep(10)
                    continue

                n += 1
                item = None
                crawl_type = ''
                if self.jhs_queue_type == 'hour':
                    # 每小时一次商品实例
                    item = JHSItem()
                    _val = _msg["val"]
                    if self.a_val: _val = _val + self.a_val

                    item.antPageGroupItemHour(_val)
                    crawl_type = 'groupitem'
                    # 汇聚
                    self.push_back(self.items, item.outTupleGroupItemHour())

                    # 入库
                    update_Sql,hourSql = item.outTupleGroupItemHour()
                    if update_Sql:
                        self.mysqlAccess.updateJhsGroupItem(update_Sql)
                    _itemhoursql_list.append(hourSql)
                    if self.insertItemhour(_itemhoursql_list): _itemhoursql_list = []

                elif self.jhs_queue_type == 'new':
                    # 商品信息
                    item = JHSItem()
                    _val = _msg["val"]
                    if self.a_val: _val = _val + self.a_val

                    item.antPageGroupItem(_val)
                    crawl_type = 'groupitemnew'
                    # 汇聚
                    self.push_back(self.items, item.outTupleGroupItem())

                    # 入库
                    iteminfoSql = item.outTupleGroupItem()
                    _iteminfosql_list.append(iteminfoSql)
                    if self.insertIteminfo(_iteminfosql_list): _iteminfosql_list = []
                else:
                    continue

                # 存网页
                if item and crawl_type != '':
                    _pages = item.outItemPage(crawl_type)
                    self.mongofsAccess.insertJHSPages(_pages)

                # 延时
                time.sleep(1)

            except Common.NoItemException as e:
                print 'Not item exception :', e

            except Common.NoPageException as e:
                print 'Not page exception :', e

            except Common.InvalidPageException as e:
                self.crawlRetry(_msg)
                print 'Invalid page exception :', e

            except Exception as e:
                print 'Unknown exception crawl item :', e
                Common.traceback_log()

                self.crawlRetry(_msg)
                # 重新拨号
                if str(e).find('Read timed out') == -1:
                    try:
                        self.dialRouter(4, 'item')
                    except Exception as e:
                        print '# DailClient Exception err:', e
                        time.sleep(10)
                time.sleep(random.uniform(10,30))
Example #19
0
class TCWorker:
    """A class of tc worker"""

    def __init__(self):
        """Create the storage clients, the crawler and the crawl timestamps."""
        # tc spot type
        self.worker_type = Config.TC_Spot
        # DB
        self.tc_type = Config.TC_TYPE  # queue type
        self.mysqlAccess = MysqlAccess()  # mysql access
        self.redisQueue = RedisQueue()  # redis queue
        self.mongofsAccess = MongofsAccess()  # mongodb fs access

        # crawl settings
        self.crawler = TCCrawler()

        # message
        self.message = Message()

        # crawl time settings
        self.crawling_time = Common.now()  # current crawl time
        self.begin_time = Common.now()
        self.begin_date = Common.today_s()
        self.begin_hour = Common.nowhour_s()

    def init_crawl(self, _obj, _crawl_type):
        """Reset the per-run state and make sure logging is configured.

        Stores _obj and _crawl_type on the instance, creates a fresh dial
        client, clears the item and give-up lists and finally calls
        init_log with the same two arguments.
        """
        self._obj = _obj
        self._crawl_type = _crawl_type

        # dial client
        self.dial_client = DialClient()

        # local ip
        self._ip = Common.local_ip()

        # router tag
        self._router_tag = "ikuai"
        # self._router_tag  = 'tpent'

        # items
        self.items = []

        # giveup items
        self.giveup_items = []

        # giveup msg val; crawlPage copies it into msg["val"] on a retry
        self.giveup_val = None
        self.init_log(_obj, _crawl_type)

    def init_log(self, _obj, _crawl_type):
        """Configure logging once, unless Logger.logger is already set.

        For _obj "channel" the logger is named "channel" and the file name
        is "add_<_crawl_type>_<stamp>"; for anything else the logger is
        "other" and the file name is "crawler_<stamp>". The stamp is
        self.begin_time formatted as %Y%m%d%H in local time.
        """
        if Logger.logger:
            return
        stamp = time.strftime("%Y%m%d%H", time.localtime(self.begin_time))
        if _obj == "channel":
            loggername = "channel"
            filename = "add_%s_%s" % (_crawl_type, stamp)
        else:
            loggername = "other"
            filename = "crawler_%s" % (stamp)
        Logger.config_logging(loggername, filename)

    # To dial router
    def dialRouter(self, _type, _obj):
        try:
            _module = "%s_%s" % (_type, _obj)
            self.dial_client.send((_module, self._ip, self._router_tag))
        except Exception as e:
            Common.log("# To dial router exception: %s" % e)

    # To crawl retry
    def crawlRetry(self, _key, msg):
        if not msg:
            return
        msg["retry"] += 1
        _retry = msg["retry"]
        _obj = msg["obj"]
        max_time = Config.crawl_retry
        if _obj == "channel":
            max_time = Config.channel_crawl_retry
        elif _obj == "item":
            max_time = Config.item_crawl_retry
        if _retry < max_time:
            self.redisQueue.put_q(_key, msg)
        else:
            # self.push_back(self.giveup_items, msg)
            Common.log("# retry too many time, no get msg:")
            Common.log(msg)

    # To crawl page
    def crawlPage(self, _obj, _crawl_type, _key, msg, _val):
        try:
            if _obj == "channel":
                self.run_channel(msg)
            else:
                Common.log("# crawlPage unknown obj = %s" % _obj)
        except Common.InvalidPageException as e:
            Common.log("# Invalid page exception: %s" % e)
            self.crawlRetry(_key, msg)
        except Common.DenypageException as e:
            Common.log("# Deny page exception: %s" % e)
            self.crawlRetry(_key, msg)
            # 重新拨号
            try:
                self.dialRouter(4, "chn")
            except Exception as e:
                Common.log("# DailClient Exception err: %s" % e)
                time.sleep(random.uniform(10, 30))
            time.sleep(random.uniform(10, 30))
        except Common.SystemBusyException as e:
            Common.log("# System busy exception: %s" % e)
            self.crawlRetry(_key, msg)
            time.sleep(random.uniform(10, 30))
        except Common.RetryException as e:
            Common.log("# Retry exception: %s" % e)
            if self.giveup_val:
                msg["val"] = self.giveup_val
            self.crawlRetry(_key, msg)
            time.sleep(random.uniform(20, 30))
        except Exception as e:
            Common.log("# exception err: %s" % e)
            self.crawlRetry(_key, msg)
            Common.traceback_log()
            if str(e).find("Read timed out") == -1:
                # 重新拨号
                try:
                    self.dialRouter(4, "chn")
                except Exception as e:
                    Common.log("# DailClient Exception err: %s" % e)
                time.sleep(random.uniform(10, 30))

    def run_channel(self, msg):
        """Crawl the channel described by msg["val"], then crawl its items."""
        channel = Channel()
        channel.antPage(msg["val"])
        # The channel object itself is handed on; self.items is not filled here.
        self.run_items(channel)

    # 并行获取商品
    def run_items(self, chan):
        """Crawl every item of a channel with a TCItemM worker group.

        The number of threads is the number of channel items, capped at
        Config.item_max_th. Raises Common.RetryException when the workers
        report any given-up items.
        """
        Common.log("# Items start, channel_id:%s, channel_name:%s" % (str(chan.channel_id), chan.channel_name))
        found = len(chan.channel_items)
        Common.log("# Items num: %d" % found)
        # Control the number of concurrent threads.
        workers = TCItemM(self._crawl_type, min(found, Config.item_max_th))
        workers.createthread()
        workers.putItems(chan.channel_items)
        workers.run()

        crawled = workers.items
        Common.log("# find Items num: %d" % len(chan.channel_items))
        Common.log("# crawl Items num: %d" % len(crawled))
        abandoned = workers.giveup_items
        if len(abandoned) > 0:
            Common.log("# giveup Items num: %d" % len(abandoned))
            raise Common.RetryException("# run_items: some items retry more than max times..")
        Common.log("# Items end, channel_id:%s, channel_name:%s" % (str(chan.channel_id), chan.channel_name))

    def process(self, _obj, _crawl_type, _val=None):
        # self.processMulti(_obj, _crawl_type, _val)
        self.processOne(_obj, _crawl_type, _val)

    def processOne(self, _obj, _crawl_type, _val=None):
        """Poll one redis queue and hand every message to crawlPage.

        The queue key is "<tc_type>_<_obj>_<_crawl_type>", or
        "<tc_type>_<_obj>" when _crawl_type is empty. An empty poll is
        followed by a 10 second sleep; the loop ends once the total number
        of empty polls exceeds the limit (2 for "channel", 20 otherwise).
        Exceptions escaping crawlPage are logged and the loop carries on.
        """
        self.init_crawl(_obj, _crawl_type)

        # The key does not change while polling, so it is built once.
        if _crawl_type:
            _key = "%s_%s_%s" % (self.tc_type, _obj, _crawl_type)
        else:
            _key = "%s_%s" % (self.tc_type, _obj)

        max_empty_polls = 2 if _obj == "channel" else 20
        empty_polls = 0
        taken = 0
        while True:
            _msg = self.redisQueue.get_q(_key)

            if not _msg:
                # Queue is empty.
                empty_polls += 1
                if empty_polls > max_empty_polls:
                    Common.log("# not get queue of key: %s" % _key)
                    Common.log("# all get num of item in queue: %d" % taken)
                    break
                time.sleep(10)
                continue

            taken += 1
            try:
                self.crawlPage(_obj, _crawl_type, _key, _msg, _val)
            except Exception as err:
                Common.log("# exception err in process of TCWorker: %s , key: %s" % (err, _key))
                Common.log(_msg)

    def processMulti(self, _obj, _crawl_type, _val=None):
        self.init_crawl(_obj, _crawl_type)
        if _crawl_type and _crawl_type != "":
            _key = "%s_%s_%s" % (self.tc_type, _obj, _crawl_type)
        else:
            _key = "%s_%s" % (self.tc_type, _obj)

        try:
            self.crawlPageMulti(_obj, _crawl_type, _key, _val)
        except Exception as e:
            Common.log("# exception err in processMulti of TCWorker: %s, key: %s" % (e, _key))

    # To crawl page
    def crawlPageMulti(self, _obj, _crawl_type, _key, _val):
        self.run_multiitems(_key, _val)
        # Common.log('# crawlPageMulti unknown obj = %s' % _obj)

    def run_multiitems(self, _key, _val):
        """Crawl the redis queue _key with a 20-thread TCItemRedisM group.

        _val is passed on as the group's a_val. The number of collected
        items is logged when the run finishes.
        """
        workers = TCItemRedisM(_key, self._crawl_type, 20, _val)
        workers.createthread()
        workers.run()
        Common.log("# crawl Items num: %d" % len(workers.items))
Example #20
0
class JHSActM(MyThread):
    '''A class of jhs activity item thread manager'''
    def __init__(self, jhs_type, thread_num = 15, a_val=None):
        '''Set up the activity thread manager.

        jhs_type   -- queue/crawl mode; crawl() dispatches on the values 1-5
        thread_num -- passed to MyThread.__init__ (default 15)
        a_val      -- appendix value, stored as self.a_val
        '''
        # parent construct
        MyThread.__init__(self, thread_num)

        # thread lock (used by push_back to guard list appends)
        self.mutex      = threading.Lock()

        # db
        self.mysqlAccess = MysqlAccess() # mysql access
        self.mongofsAccess = MongofsAccess() # mongodb fs access

        # appendix val
        self.a_val = a_val

        # jhs queue type
        # 1: upcoming brand-group channel page, 2: check activities that have
        # not ended yet each day, 3: newly added activities.
        # NOTE(review): crawl() also handles 4 and 5, which this list omits.
        self.jhs_type   = jhs_type

        # activity items
        self.items      = []

        # dial client
        self.dial_client = DialClient()

        # local ip
        self._ip = Common.local_ip()

        # router tag (alternative value previously used here: 'tpent')
        self._tag = 'ikuai'
        #self._tag = 'tpent'

    # To dial router
    def dialRouter(self, _type, _obj):
        try:
            _module = '%s_%s' %(_type, _obj)
            self.dial_client.send((_module, self._ip, self._tag))
        except Exception as e:
            print '# To dial router exception :', e

    def push_back(self, L, v):
        '''Append v to the list L while holding self.mutex.

        The lock is taken with a `with` block so it is released even when
        the append raises; the previous manual acquire/release pair left the
        mutex held forever in that case, blocking every other thread.

        L -- the list to append to
        v -- the value to append
        '''
        with self.mutex:
            L.append(v)

    def putItem(self, _item):
        '''Queue a single item with a retry count of 0.'''
        fresh_entry = (0, _item)
        self.put_q(fresh_entry)

    def putItems(self, _items):
        '''Queue every item of _items, each with a retry count of 0.'''
        for entry_value in _items:
            self.put_q((0, entry_value))

    # To crawl retry
    def crawlRetry(self, _data):
        if not _data: return
        _retry, _val = _data
        _retry += 1
        if _retry < Config.act_crawl_retry:
            _data = (_retry, _val)
            self.put_q(_data)
        else:
            print "# retry too many times, no get item:", _val

    # insert act
    def insertAct(self, actsql_list, f=False):
        '''Flush the activity rows to MySQL when the batch is full or forced.

        Returns True when a flush was due (f is truthy or the list holds at
        least Config.act_max_arg rows), False otherwise.  An empty list is
        never sent to the database, but a due flush still returns True.
        '''
        flush_due = f or len(actsql_list) >= Config.act_max_arg
        if not flush_due:
            return False

        if len(actsql_list) > 0:
            self.mysqlAccess.insertJhsAct(actsql_list)
        return True

    # insert act day
    def insertActday(self, actdaysql_list, f=False):
        '''Flush the per-day activity rows to MySQL when the batch is full or forced.

        Returns True when a flush was due (f is truthy or the list holds at
        least Config.act_max_arg rows), False otherwise.  An empty list is
        never sent to the database, but a due flush still returns True.
        '''
        flush_due = f or len(actdaysql_list) >= Config.act_max_arg
        if not flush_due:
            return False

        if len(actdaysql_list) > 0:
            self.mysqlAccess.insertJhsActDayalive(actdaysql_list)
        return True

    # insert act hour
    def insertActhour(self, acthoursql_list, f=False):
        '''Flush the per-hour activity rows to MySQL when the batch is full or forced.

        Returns True when a flush was due (f is truthy or the list holds at
        least Config.act_max_arg rows), False otherwise.  An empty list is
        never sent to the database, but a due flush still returns True.
        '''
        flush_due = f or len(acthoursql_list) >= Config.act_max_arg
        if not flush_due:
            return False

        if len(acthoursql_list) > 0:
            self.mysqlAccess.insertJhsActHouralive(acthoursql_list)
        return True

    # insert act coming
    def insertActcoming(self, actcomingsql_list, f=False):
        '''Flush the upcoming-activity rows to MySQL when the batch is full or forced.

        Returns True when a flush was due (f is truthy or the list holds at
        least Config.act_max_arg rows), False otherwise.  An empty list is
        never sent to the database, but a due flush still returns True.
        '''
        flush_due = f or len(actcomingsql_list) >= Config.act_max_arg
        if not flush_due:
            return False

        if len(actcomingsql_list) > 0:
            self.mysqlAccess.insertJhsActComing(actcomingsql_list)
        return True

    # To crawl item
    def crawl(self):
        '''Worker loop: take (retry, val) entries off the queue and crawl them.

        Runs until self.get_q() raises Empty.  For each entry a JHSAct is
        created and the ant* method matching self.jhs_type (1-5) is called
        with the entry's value; the resulting tuple is appended to self.items
        and the crawled pages are stored through self.mongofsAccess.  For
        jhs_type 1 the SQL rows are additionally batched and written with
        insertActcoming(), with a final forced flush when the queue is empty.
        Any other jhs_type just marks the task done.

        Error handling per entry:
          * NoActivityException / NoPageException: printed, task marked done.
          * InvalidPageException: entry re-queued via crawlRetry().
          * any other Exception: traceback logged, entry re-queued, router
            re-dialled unless the message contains 'Read timed out', then a
            random 10-30 second sleep.
        '''
        # sql list
        #_actsql_list, _actdaysql_list, _acthoursql_list = [], [], []
        _actcomingsql_list = []
        while True:
            _data = None
            try:
                try:
                    # fetch the next message from the queue
                    _data = self.get_q()
                except Empty as e:
                    # queue is empty: flush what is left and exit
                    #print '# queue is empty', e
                    self.insertActcoming(_actcomingsql_list, True)
                    _actcomingsql_list = []

                    break

                item = None
                crawl_type = ''
                if self.jhs_type == 1:
                    # brand-group activity instance: coming soon
                    item = JHSAct()

                    # process the info
                    _val  = _data[1]
                    item.antPageComing(_val)
                    print '# To crawl coming activity val : ', Common.now_s(), _val[1], _val[2], _val[3]
                    crawl_type = 'coming'
                    # aggregate
                    # NOTE(review): outTupleForComing() is called twice (here and
                    # on the next line) — confirm it is cheap and side-effect free.
                    self.push_back(self.items, item.outTupleForComing())
                    crawling_confirm,sql = item.outTupleForComing()
                    # store into the DB
                    if crawling_confirm == 1:
                        _actcomingsql_list.append(sql)
                    if self.insertActcoming(_actcomingsql_list): _actcomingsql_list = []
                elif self.jhs_type == 2:
                    # brand-group activity instance: check items newly added to the activity
                    item = JHSAct()

                    # process the info
                    _val  = _data[1]
                    item.antPageHourcheck(_val)
                    #print '# To check activity val : ', Common.now_s(), _val[0], _val[1]
                    crawl_type = 'hourcheck'
                    # aggregate
                    self.push_back(self.items, item.outTupleForHourcheck())
                elif self.jhs_type == 3:
                    # brand-group activity instance
                    item = JHSAct()

                    # process the info
                    _val  = _data[1]
                    item.antPage(_val)
                    #print '# To crawl activity val : ', Common.now_s(), _val[1], _val[2], _val[3]
                    crawl_type = 'brand'

                    # aggregate
                    self.push_back(self.items, item.outTuple())

                elif self.jhs_type == 4:
                    # brand-group activity instance that has not started yet
                    item = JHSAct()

                    # process the info
                    _val  = _data[1]
                    item.antPageMain(_val)
                    #print '# To crawl activity val : ', Common.now_s(), _val[1], _val[2], _val[3]
                    crawl_type = 'main'

                    # aggregate
                    self.push_back(self.items, item.outTuple())

                elif self.jhs_type == 5:
                    # parse brand-group activity data
                    item = JHSAct()

                    # process the info
                    _val  = _data[1]
                    item.antPageParser(_val)
                    #print '# To crawl activity val : ', Common.now_s(), _val[1], _val[2], _val[3]
                    crawl_type = 'parser'

                    # aggregate
                    self.push_back(self.items, item.outTupleParse())
                else:
                    # unknown jhs_type: nothing to crawl, just acknowledge the entry
                    self.queue.task_done()
                    continue

                # save the crawled web pages
                if item and crawl_type != '':
                    _pages = item.outItemPage(crawl_type)
                    self.mongofsAccess.insertJHSPages(_pages)


                # notify the queue that the task is done
                self.queue.task_done()

            except Common.NoActivityException as e:
                print 'Not activity exception :', e
                # notify the queue that the task is done
                self.queue.task_done()

            except Common.NoPageException as e:
                print 'Not page exception :', e
                # notify the queue that the task is done
                self.queue.task_done()

            except Common.InvalidPageException as e:
                # put the entry back on the queue for another attempt
                self.crawlRetry(_data)
                print 'Invalid page exception :', e
                # notify the queue that the task is done
                self.queue.task_done()

            except Exception as e:
                print 'Unknown exception crawl item :', e
                Common.traceback_log()

                # NOTE(review): if get_q() itself raised something other than
                # Empty, _data is still None here and task_done() is called
                # without a matching get — confirm this cannot happen.
                self.crawlRetry(_data)
                # notify the queue that the task is done
                self.queue.task_done()
                # re-dial the router, except after a read timeout
                if str(e).find('Read timed out') == -1:
                    try:
                        self.dialRouter(4, 'chn')
                    except Exception as e:
                        print '# DailClient Exception err:', e
                        time.sleep(10)
                # back off for a random 10-30 seconds before the next entry
                time.sleep(random.uniform(10,30))