예제 #1
0
class JHSGroupItemQM(MyThread):
    '''A class of jhs Item redis queue'''
    def __init__(self, itemtype, q_type, thread_num=10, a_val=None):
        # parent construct
        MyThread.__init__(self, thread_num)
        # thread lock
        self.mutex          = threading.Lock()

        self.jhs_type       = Config.JHS_TYPE # jhs type
        self.item_type      = itemtype      # item type

        # db
        self.mysqlAccess    = MysqlAccess() # mysql access
        self.redisQueue     = RedisQueue()  # redis queue
        self.mongofsAccess  = MongofsAccess() # mongodb fs access

        # jhs queue type
        self.jhs_queue_type = q_type     # h:每小时
        self._key           = '%s_%s_%s' % (self.jhs_type,self.item_type,self.jhs_queue_type)

        # appendix val
        self.a_val          = a_val

        # activity items
        self.items          = []

        # dial client
        self.dial_client    = DialClient()

        # local ip
        self._ip            = Common.local_ip()

        # router tag
        self._tag           = 'ikuai'
        #self._tag          = 'tpent'

        # give up item, retry too many times
        self.giveup_items   = []

    # To dial router
    def dialRouter(self, _type, _obj):
        try:
            _module = '%s_%s' %(_type, _obj)
            self.dial_client.send((_module, self._ip, self._tag))
        except Exception as e:
            print '# To dial router exception :', e

    def push_back(self, L, v):
        if self.mutex.acquire(1):
            L.append(v)
            self.mutex.release()

    # To crawl retry
    def crawlRetry(self, msg):
        if not msg: return
        msg['retry'] += 1
        _retry = msg['retry']
        _obj = msg["obj"]
        max_time = Config.crawl_retry
        if _obj == 'groupitemcat':
            max_time = Config.json_crawl_retry
        elif _obj == 'groupitem':
            max_time = Config.item_crawl_retry
        if _retry < max_time:
            self.redisQueue.put_q(self._key, msg)
        else:
            #self.push_back(self.giveup_items, msg)
            print "# retry too many time, no get:", msg

    # insert item info
    def insertIteminfo(self, iteminfosql_list, f=False):
        if f or len(iteminfosql_list) >= Config.item_max_arg:
            if len(iteminfosql_list) > 0:
                self.mysqlAccess.insertJhsGroupItemInfo(iteminfosql_list)
                #print '# insert data to database'
            return True
        return False

    # insert item hour
    def insertItemhour(self, itemhoursql_list, f=False):
        if f or len(itemhoursql_list) >= Config.item_max_arg:
            if len(itemhoursql_list) > 0:
                self.mysqlAccess.insertJhsGroupItemForHour(itemhoursql_list)
                #print '# insert hour data to database'
            return True
        return False

    # item sql list
    def crawl(self):
        _iteminfosql_list = []
        _itemhoursql_list = []
        i, M = 0, 10
        n = 0
        while True:
            try:
                _msg = self.redisQueue.get_q(self._key)

                # 队列为空
                if not _msg:
                    # 队列为空,退出
                    #print '# queue is empty', e
                    # info
                    self.insertIteminfo(_iteminfosql_list, True)
                    _iteminfosql_list = []

                    # hour
                    self.insertItemhour(_itemhoursql_list, True)
                    _itemhoursql_list = []

                    i += 1
                    if i > M:
                        print '# all get itemQ item num:',n
                        print '# not get itemQ of key:',self._key
                        break
                    time.sleep(10)
                    continue

                n += 1
                item = None
                crawl_type = ''
                if self.jhs_queue_type == 'hour':
                    # 每小时一次商品实例
                    item = JHSItem()
                    _val = _msg["val"]
                    if self.a_val: _val = _val + self.a_val

                    item.antPageGroupItemHour(_val)
                    crawl_type = 'groupitem'
                    # 汇聚
                    self.push_back(self.items, item.outTupleGroupItemHour())

                    # 入库
                    update_Sql,hourSql = item.outTupleGroupItemHour()
                    if update_Sql:
                        self.mysqlAccess.updateJhsGroupItem(update_Sql)
                    _itemhoursql_list.append(hourSql)
                    if self.insertItemhour(_itemhoursql_list): _itemhoursql_list = []

                elif self.jhs_queue_type == 'new':
                    # 商品信息
                    item = JHSItem()
                    _val = _msg["val"]
                    if self.a_val: _val = _val + self.a_val

                    item.antPageGroupItem(_val)
                    crawl_type = 'groupitemnew'
                    # 汇聚
                    self.push_back(self.items, item.outTupleGroupItem())

                    # 入库
                    iteminfoSql = item.outTupleGroupItem()
                    _iteminfosql_list.append(iteminfoSql)
                    if self.insertIteminfo(_iteminfosql_list): _iteminfosql_list = []
                else:
                    continue

                # 存网页
                if item and crawl_type != '':
                    _pages = item.outItemPage(crawl_type)
                    self.mongofsAccess.insertJHSPages(_pages)

                # 延时
                time.sleep(1)

            except Common.NoItemException as e:
                print 'Not item exception :', e

            except Common.NoPageException as e:
                print 'Not page exception :', e

            except Common.InvalidPageException as e:
                self.crawlRetry(_msg)
                print 'Invalid page exception :', e

            except Exception as e:
                print 'Unknown exception crawl item :', e
                Common.traceback_log()

                self.crawlRetry(_msg)
                # 重新拨号
                if str(e).find('Read timed out') == -1:
                    try:
                        self.dialRouter(4, 'item')
                    except Exception as e:
                        print '# DailClient Exception err:', e
                        time.sleep(10)
                time.sleep(random.uniform(10,30))
예제 #2
0
class JHSGroupItemCrawlerM(MyThread):
    '''A class of jhs item thread manager'''
    def __init__(self, jhs_type, thread_num=10, a_val=None):
        # parent construct
        MyThread.__init__(self, thread_num)

        # thread lock
        self.mutex = threading.Lock()

        # db
        self.mysqlAccess = MysqlAccess() # mysql access
        self.mongofsAccess = MongofsAccess() # mongodb fs access

        # jhs queue type
        self.jhs_type = jhs_type # h:每小时, i:商品信息详情

        # appendix val
        self.a_val = a_val
        
        # activity items
        self.items = []

        # dial client
        self.dial_client = DialClient()

        # local ip
        self._ip = Common.local_ip()

        # router tag
        self._tag = 'ikuai'
        #self._tag = 'tpent'

        # give up item, retry too many times
        self.giveup_items = []

    # To dial router
    def dialRouter(self, _type, _obj):
        try:
            _module = '%s_%s' %(_type, _obj)
            self.dial_client.send((_module, self._ip, self._tag))
        except Exception as e:
            print '# To dial router exception :', e

    def push_back(self, L, v):
        if self.mutex.acquire(1):
            L.append(v)
            self.mutex.release()

    def putItem(self, _item):
        self.put_q((0, _item))

    def putItems(self, _items):
        for _item in _items: self.put_q((0, _item))

    # To crawl retry
    def crawlRetry(self, _data):
        if not _data: return
        _retry, _val = _data
        _retry += 1
        if _retry < Config.crawl_retry:
            _data = (_retry, _val)
            self.put_q(_data)
        else:
            self.push_back(self.giveup_items, _val)
            print "# retry too many times, no get item:", _val

    # insert item info
    def insertIteminfo(self, iteminfosql_list, f=False):
        if f or len(iteminfosql_list) >= Config.item_max_arg:
            if len(iteminfosql_list) > 0:
                self.mysqlAccess.insertJhsGroupItemInfo(iteminfosql_list)
                #print '# insert data to database'
            return True
        return False

    # insert item hour
    def insertItemhour(self, itemhoursql_list, f=False):
        if f or len(itemhoursql_list) >= Config.item_max_arg:
            if len(itemhoursql_list) > 0:
                self.mysqlAccess.insertJhsGroupItemForHour(itemhoursql_list)
                #print '# insert hour data to database'
            return True
        return False

    # insert item coming
    def insertItemComing(self, itemsql_list, f=False):
        if f or len(itemsql_list) >= Config.item_max_arg:
            if len(itemsql_list) > 0:
                self.mysqlAccess.insertJhsGroupItemComing(itemsql_list)
                #print '# insert item coming data to database'
            return True
        return False

    # insert item position
    def insertItemPosition(self, itemsql_list, f=False):
        if f or len(itemsql_list) >= Config.item_max_arg:
            if len(itemsql_list) > 0:
                self.mysqlAccess.insertJhsGroupItemPosition(itemsql_list)
                #print '# insert position data to database'
            return True
        return False

    # To crawl item
    def crawl(self):
        # item sql list
        _iteminfosql_list = []
        _itemhoursql_list = []
        _itemcomingsql_list = []
        _itempositionsql_list = []
        while True:
            _data = None
            try:
                try:
                    # 取队列消息
                    _data = self.get_q()
                except Empty as e:
                    # 队列为空,退出
                    #print '# queue is empty', e
                    # info
                    self.insertIteminfo(_iteminfosql_list, True)
                    _iteminfosql_list = []

                    # hour
                    self.insertItemhour(_itemhoursql_list, True)
                    _itemhoursql_list = []

                    # coming
                    self.insertItemComing(_itemcomingsql_list, True)
                    _itemcomingsql_list = []

                    # position
                    self.insertItemPosition(_itempositionsql_list, True)
                    _itempositionsql_list = []

                    break

                item = None
                crawl_type = ''
                if self.jhs_type == 'hour':
                    # 每小时一次商品实例
                    item = JHSItem()
                    _val = _data[1]
                    if self.a_val: _val = _val + self.a_val

                    item.antPageGroupItemHour(_val)
                    #print '# Hour To crawl item val : ', Common.now_s(), _val[0], _val[4], _val[5]
                    crawl_type = 'groupitem'
                    # 汇聚
                    #self.push_back(self.items, item.outTupleGroupItemHour())

                    update_Sql,hourSql = item.outTupleGroupItemHour()
                    if update_Sql:
                        self.mysqlAccess.updateJhsGroupItem(update_Sql)
                    _itemhoursql_list.append(hourSql)
                    if self.insertItemhour(_itemhoursql_list): _itemhoursql_list = []

                elif self.jhs_type == 'new':
                    # 商品信息
                    item = JHSItem()
                    _val = _data[1]
                    if self.a_val: _val = _val + self.a_val

                    item.antPageGroupItem(_val)
                    #print '# To crawl item val : ', Common.now_s(), _val[0], _val[4], _val[5]
                    crawl_type = 'groupitemnew'
                    # 汇聚
                    self.push_back(self.items, item.outTupleGroupItem())

                    iteminfoSql = item.outTupleGroupItem()
                    _iteminfosql_list.append(iteminfoSql)
                    if self.insertIteminfo(_iteminfosql_list): _iteminfosql_list = []
                else:
                    # 通知queue, task结束
                    self.queue.task_done()
                    continue

                # 存网页
                if item and crawl_type != '':
                    _pages = item.outItemPage(crawl_type)
                    self.mongofsAccess.insertJHSPages(_pages)


                # 延时
                time.sleep(1)
                # 通知queue, task结束
                self.queue.task_done()

            except Common.NoItemException as e:
                print 'Not item exception :', e
                # 通知queue, task结束
                self.queue.task_done()

            except Common.NoPageException as e:
                print 'Not page exception :', e
                # 通知queue, task结束
                self.queue.task_done()

            except Common.InvalidPageException as e:
                self.crawlRetry(_data)
                print 'Invalid page exception :', e
                # 通知queue, task结束
                self.queue.task_done()

            except Exception as e:
                print 'Unknown exception crawl item :', e
                #traceback.print_exc()
                print '#####--Traceback Start--#####'
                tp,val,td = sys.exc_info()
                for file, lineno, function, text in traceback.extract_tb(td):
                    print "exception traceback err:%s,line:%s,in:%s"%(file, lineno, function)
                    print text
                print "exception traceback err:%s,%s,%s"%(tp,val,td)
                print '#####--Traceback End--#####'
                self.crawlRetry(_data)
                # 通知queue, task结束
                self.queue.task_done()
                if str(e).find('Name or service not known') != -1 or str(e).find('Temporary failure in name resolution') != -1:
                    print _data
                # 重新拨号
                if str(e).find('Read timed out') == -1:
                    try:
                        self.dialRouter(4, 'item')
                    except Exception as e:
                        print '# DailClient Exception err:', e 
                        time.sleep(10)
                time.sleep(random.uniform(10,30))