def crawl(self):
    """Drain the work queue, crawl each item and batch its SQL for insertion.

    Each iteration fetches one message with ``self.get_q()``. When the queue
    type is ``'main'`` an ``Item`` is built from ``_data[1]`` (with
    ``self.a_val`` appended when it is set), its SQL is pushed onto
    ``self.items`` and also appended to a local batch that is handed to
    ``self.insertIteminfo``. The batch is reset whenever ``insertIteminfo``
    returns a truthy value.

    The loop ends when ``get_q()`` raises ``Empty``; the remaining batch is
    then passed to ``self.insertIteminfo(batch, True)`` (the ``True`` flag
    presumably forces the flush -- confirm against ``insertIteminfo``).

    Every handled outcome calls ``self.queue.task_done()``. Invalid pages and
    unexpected errors additionally go through ``self.crawlRetry(_data)``.

    NOTE(review): the nesting of the trailing ``time.sleep`` calls in the
    generic handler was reconstructed from a whitespace-collapsed source --
    confirm against the original layout.
    """
    pending_info_sqls = []

    while True:
        _data = None
        try:
            try:
                # Fetch the next message from the queue.
                _data = self.get_q()
            except Empty:
                # Queue is empty: hand over whatever is still batched and stop.
                # NOTE(review): if this call raises, the generic handler below
                # runs task_done() for a message that was never fetched --
                # confirm whether that is intended.
                self.insertIteminfo(pending_info_sqls, True)
                break

            if self._q_type == 'main':
                # Build a fresh item instance for this message.
                item = Item()
                _val = _data[1]
                if self.a_val:
                    _val = _val + self.a_val
                item.antPage(_val)

                # Aggregate the result in memory.
                self.push_back(self.items, item.outSql())

                # Queue the row for the database; outSql() is called a second
                # time on purpose so both consumers get their own result,
                # exactly as before.
                pending_info_sqls.append(item.outSql())
                if self.insertIteminfo(pending_info_sqls):
                    pending_info_sqls = []

            # NOTE: persisting the raw pages (item.outItemPage ->
            # mongofsAccess.insertJMPages) is currently disabled.

            # Throttle between items.
            time.sleep(1)
            # Tell the queue this task is finished.
            self.queue.task_done()

        except Common.NoItemException as e:
            Common.log('# Not item exception: %s' % e)
            self.queue.task_done()

        except Common.NoPageException as e:
            Common.log('# Not page exception: %s' % e)
            self.queue.task_done()

        except Common.InvalidPageException as e:
            # Re-queue before marking the task done.
            self.crawlRetry(_data)
            Common.log('# Invalid page exception: %s' % e)
            self.queue.task_done()

        except Exception as e:
            Common.log('# Unknown exception crawl item: %s' % e)
            Common.traceback_log()
            self.crawlRetry(_data)
            self.queue.task_done()

            err_text = str(e)
            if ('Name or service not known' in err_text
                    or 'Temporary failure in name resolution' in err_text):
                # Name-resolution failure: log the offending message as well.
                Common.log(_data)
            if 'Read timed out' not in err_text:
                # Anything but a read timeout triggers a router redial.
                try:
                    self.dialRouter(4, 'item')
                except Exception as dial_err:
                    # A distinct name keeps the outer exception variable intact.
                    Common.log('# DailClient Exception err: %s' % dial_err)
                    time.sleep(10)
                time.sleep(random.uniform(10, 40))