def test():
    """Push one `test_job` through a `ThreadPool(10)` and wait for it.

    Prints a marker line before the job is queued and another once
    `wait_for_complete()` has returned.
    """
    print('start testing')
    pool = ThreadPool(10)
    job_total = 1
    for index in range(job_total):
        # Second job argument is the index scaled by 0.001.
        scaled = index * 0.001
        pool.add_job(test_job, index, scaled)
    pool.wait_for_complete()
    print('end testing')
def test():
    """Smoke-run the thread pool with a single queued `test_job`.

    Emits 'start testing', queues the job on a `ThreadPool(10)`, blocks in
    `wait_for_complete()`, then emits 'end testing'.
    """
    print('start testing')
    workers = ThreadPool(10)
    # One job only: (0, 0.0) are the arguments handed to test_job.
    jobs = [(n, n * 0.001) for n in range(1)]
    for number, fraction in jobs:
        workers.add_job(test_job, number, fraction)
    workers.wait_for_complete()
    print('end testing')
class Interface(object):
    """Command interface backed by a thread pool and a pipe file descriptor.

    Attribute lookups that fail return ``None`` instead of raising
    ``AttributeError`` (see ``__getattribute__``), so callers can probe for
    optional commands with a plain attribute access.
    """

    def __init__(self):
        self._read_config()
        self._init_threadpool()

    def _read_config(self):
        """Read queue/thread-pool settings and open the pipe file.

        The pipe file is opened read/write, non-blocking, and is created if
        it does not exist. The descriptor is stored in ``self.pipe_fd``.
        """
        self.pipe_file = Base.get_config("QUEUE", "PIPE_FILE")
        self.queue_size = Base.get_config("QUEUE", "QUEUE_SIZE")
        self.thread_pool_num = Base.get_config("THREADPOOL", "NUM")
        self.pipe_fd = os.open(self.pipe_file,
                               os.O_NONBLOCK | os.O_CREAT | os.O_RDWR)

    def _init_threadpool(self):
        """Create the thread pool from the configured values (cast to int)."""
        self.pool = ThreadPool(int(self.thread_pool_num), int(self.queue_size))

    def write(self, string):
        """Print ``string`` to standard output."""
        print(string)

    def transcode(self, string):
        """Queue ``_transcode(string)`` on the thread pool."""
        self.pool.add_job(self._transcode, string)

    def _transcode(self, filepath):
        """Placeholder job: print the path, sleep 10 seconds, print "ok"."""
        print(filepath)
        time.sleep(10)
        print("ok")

    def __getattribute__(self, name):
        """Return the attribute, or ``None`` when it does not exist.

        Only ``AttributeError`` is translated to ``None``; any other
        exception (for example ``KeyboardInterrupt``) propagates instead of
        being silently swallowed.
        """
        try:
            return object.__getattribute__(self, name)
        except AttributeError:
            return None

    def __del__(self):
        """Close the pipe descriptor if one was opened.

        ``pipe_fd`` reads as ``None`` when ``_read_config`` never got as far
        as ``os.open`` (missing attributes resolve to ``None``), in which
        case there is nothing to close. The attribute is cleared before
        closing so a second finalisation cannot close an unrelated,
        recycled descriptor.
        """
        fd = self.pipe_fd
        if fd is None:
            return
        self.pipe_fd = None
        os.close(fd)
class MessageBus(object):
    """Message bus bridging the bot and its command handlers.

    - sends messages over the stream it was given;
    - distributes received messages to group members;
    - hands command messages to the matching command handler (the admin
      handler for admins, the regular one otherwise);
    - lets command handlers reply to the sender or broadcast a result.
    """

    def __init__(self, bot_jid, stream):
        self.bot_jid = bot_jid
        self._stream = stream
        self.cmd_handler = CommandHandler(message_bus=self)
        self.admin_cmd_handler = AdminCMDHandler(message_bus=self)
        self._thread_pool = ThreadPool(5)
        self._thread_pool.start()  # start the thread pool
        self.logger = get_logger()

    def make_message(self, to, typ, body):
        """Build a message originating from the bot.

        `to`   - recipient JID
        `typ`  - message type; anything other than 'normal', 'chat',
                 'groupchat' or 'headline' is replaced by 'normal'
        `body` - message body
        """
        if typ not in ['normal', 'chat', 'groupchat', 'headline']:
            typ = 'normal'
        return Message(from_jid=self.bot_jid, to_jid=to,
                       stanza_type=typ, body=body)

    def send_to_admin(self, stanza, body):
        """Send `body` to every entry of ADMINS, recording it in history."""
        for admin in ADMINS:
            self.send_message(stanza, admin, body, True)

    def send_private_msg(self, stanza, to, body):
        """Send a private message to `to`, prefixed with the sender's nick."""
        frm = stanza.from_jid
        nick = get_nick(frm)
        body = "[%s 悄悄对你说] %s" % (nick, body)
        self.send_message(stanza, to, body, True)

    def send_message(self, stanza, to, body, log=False):
        """Send a message, or store it when the recipient is offline.

        `stanza` - the stanza being answered (supplies sender and type)
        `to`     - recipient; an offline recipient gets the message
                   appended to its stored 'offline_message' info instead
        `body`   - message body
        `log`    - when true, record the message in history first

        An online recipient whose mode is set to something other than
        'talk' receives nothing.
        """
        if log:
            add_history(stanza.from_jid, to, body)

        if is_online(to):
            mode = get_info('mode', to)
            if mode == 'talk' or not mode:
                if isinstance(to, (str, unicode)):
                    to = JID(to)
                self.logger.debug("send '{0}' to {1!r}".format(body, to))
                typ = stanza.stanza_type
                self._stream.send(self.make_message(to, typ, body))
        else:
            # Timestamp the text and append it to what is already stored.
            body = NOW() + ' ' + body
            self.logger.debug("store offline message'{0}' for {1!r}"
                              .format(body, to))
            offline_message = get_info('offline_message', to, '')
            offline_message += '\n' + body
            add_info('offline_message', offline_message, to)

    def send_offline_message(self, stanza):
        """Deliver the stored offline messages to the stanza's sender."""
        show = stanza.show
        frm = stanza.from_jid
        offline_message = get_info('offline_message', frm)
        if offline_message:
            offline_message = "离线期间的消息:\n" + offline_message
            m = self.make_message(frm, 'normal', offline_message)
            self._stream.send(m)
            # NOTE(review): the flattened source does not show whether the
            # two calls below belong inside this `if` or run unconditionally;
            # confirm against the original file.
            set_online(frm, show)
            add_info('offline_message', '', frm)

    def send_all_msg(self, stanza, body):
        """Send a message to the members returned for the sender.

        A few bodies are shortcuts for commands (a city id, 'help', 'ping')
        and are routed to `send_command` instead. A sender in 'quiet' mode
        only gets a notice back. Otherwise the message is recorded and sent
        to every member on the sender's current channel.
        """
        text = body.strip()
        if cityid(text):
            return self.send_command(stanza, '-_tq ' + text)
        if text == 'help':
            return self.send_command(stanza, '-help')
        if text == 'ping':
            return self.send_command(stanza, '-_ping')

        mode = get_info('mode', stanza.from_jid)
        if mode == 'quiet':
            body = u'你处于{0},请使用-cd命令切换到 {1} '\
                   u'后发言'.format(MODES[mode], MODES['talk'])
            return self.send_back_msg(stanza, body)

        add_history(stanza.from_jid, 'all', body)
        members = get_members(stanza.from_jid)
        current = get_info('channel', stanza.from_jid, 'main')
        # Only members on the sender's channel receive the message.
        members = [m for m in members
                   if get_info('channel', m, 'main') == current]
        self.logger.info("{0} send message {1} to {2!r}"
                         .format(stanza.from_jid, body, members))
        nick = get_nick(stanza.from_jid)
        body = "[{0}] {1}".format(nick, body)
        for m in members:
            self.send_message(stanza, m, body)

    def send_back_msg(self, stanza, body):
        """Send a reply to the bare JID of the stanza's sender."""
        to = stanza.from_jid.bare().as_string()
        typ = stanza.stanza_type
        self._stream.send(self.make_message(to, typ, body))

    def send_sys_msg(self, stanza, body):
        """Send a system message to every member from `get_members()`."""
        for m in get_members():
            self.send_message(stanza, m, body)

    def send_command(self, stanza, body):
        """Run a command on the thread pool so the caller is not blocked.

        Senders whose e-mail is in ADMINS use the admin command handler;
        everyone else uses the regular command handler.
        """
        email = get_email(stanza.from_jid)
        self.logger.info("{0} run command {1}".format(stanza.from_jid, body))
        if email in ADMINS:
            target = self.admin_cmd_handler._run_cmd
        else:
            target = self.cmd_handler._run_cmd
        self._thread_pool.add_job(target, stanza, body)

    def send_status(self, statustext, to=None):
        """Send a presence carrying `statustext`, addressed to `to` if given."""
        if to:
            to_jid = JID(to)
            p = Presence(status=statustext, to_jid=to_jid)
        else:
            p = Presence(status=statustext)
        self._stream.send(p)

    def send_subscribe(self, jid):
        """Send 'subscribed' followed by 'subscribe' presences to `jid`."""
        p1 = Presence(from_jid=self.bot_jid, to_jid=jid,
                      stanza_type='subscribe')
        p = Presence(from_jid=self.bot_jid, to_jid=jid,
                     stanza_type='subscribed')
        self._stream.send(p)
        self._stream.send(p1)

    def send_unsubscribe(self, jid):
        """Send 'unsubscribed' followed by 'unsubscribe' presences to `jid`."""
        # The bot's own JID is stored as `bot_jid`; this class never sets
        # `my_jid`, so reading it raised AttributeError.
        p1 = Presence(from_jid=self.bot_jid, to_jid=jid,
                      stanza_type='unsubscribe')
        p = Presence(from_jid=self.bot_jid, to_jid=jid,
                     stanza_type='unsubscribed')
        self._stream.send(p)
        self._stream.send(p1)
class Interface(object):
    """Command interface: image localisation to Qiniu and a transcode stub.

    Attribute lookups that fail return ``None`` instead of raising
    ``AttributeError`` (see ``__getattribute__``).
    """

    # URLs matching this pattern are treated as already hosted on Qiniu.
    # NOTE(review): the dots in the host name are unescaped, so each one
    # matches any character.
    img_url_exp = re.compile(r'http://qdimg.okjiaoyu.cn/[\S\s]*')
    # Filled with (bucket, file name) to build the public URL.
    qiniu_prefix = 'http://%s.okjiaoyu.cn/%s'

    def __init__(self):
        self._read_config()
        self._init_threadpool()
        self._get_pid()

    def _read_config(self):
        """Read the queue size and thread-pool size from the configuration."""
        configer = Configer()
        self.queue_size = configer.get_configer('QUEUE', 'queue_size')
        self.thread_pool_num = configer.get_configer('THREADPOOL', 'num')

    def _init_threadpool(self):
        """Create the thread pool from the configured values (cast to int)."""
        self.pool = ThreadPool(int(self.thread_pool_num), int(self.queue_size))

    def _get_pid(self):
        """Remember the current process id for ``kill``."""
        self.pid = os.getpid()

    def write(self, string):
        """Print ``string`` to standard output."""
        print(string)

    @staticmethod
    def _iter_image_nodes(content):
        """Yield every dict in ``content`` whose 'type' is 'image'.

        ``content`` maps keys to sequences of items. For each item, in
        order:
        - a dict: first the image dicts inside its 'group' (if present),
          then the dict itself when it is an image;
        - a list: the image dicts it contains.
        Anything else, including non-dict entries inside a group or list,
        is skipped. The yielded dicts are the objects stored in
        ``content``, so mutating them updates ``content`` in place.
        """
        def is_image(node):
            return isinstance(node, dict) and node.get('type') == 'image'

        for _key, entries in content.items():
            for item in entries:
                if isinstance(item, dict):
                    if 'group' in item:
                        for member in item['group']:
                            if is_image(member):
                                yield member
                    if is_image(item):
                        yield item
                elif isinstance(item, list):
                    for member in item:
                        if is_image(member):
                            yield member

    def local_img(self, string):
        """Re-host the images of one question on Qiniu and update Mongo.

        ``string`` is the question id (converted with ``int``). Each image
        node in the question's JSON content is uploaded through
        ``_upload_qiniu``; on success its 'value' and every occurrence of
        the old URL in the HTML text are replaced. Both collections are
        written back only when at least one URL changed.
        """
        from gl import LOG
        LOG.info('start local img,question id [%s]' % string)
        question_id = int(string)
        query = {'question_id': question_id}

        mongo = Mongo()
        mongo.connect('resource')
        mongo.select_collection('mongo_question_json')
        question = mongo.find_one(query, {'content': 1})
        mongo.select_collection('mongo_question_html')
        # NOTE(review): this stringifies whatever find_one returns, not its
        # 'content' field, and that string is later stored as 'content'.
        # Kept as-is; confirm against the Mongo wrapper.
        html = str(mongo.find_one(query, {'content': 1}))

        if not question:
            return
        content = question['content']

        updated = False
        for node in self._iter_image_nodes(content):
            ori_url = node['value']
            qiniu_url = self._upload_qiniu(ori_url)
            if qiniu_url:
                node['value'] = qiniu_url
                html = html.replace(ori_url, qiniu_url)
                updated = True
        if not updated:
            return

        mongo.select_collection('mongo_question_json')
        json_effected = mongo.update_many(query, {'$set': {'content': content}})
        mongo.select_collection('mongo_question_html')
        html_effected = mongo.update_many(query, {'$set': {'content': html}})
        LOG.info('mongo update successful json[%d] -- html[%d]' %
                 (json_effected, html_effected))

    def _upload_qiniu(self, ori_url):
        """Download ``ori_url`` and upload the bytes to the 'qdimg' bucket.

        Returns the new Qiniu URL on success. Returns ``None`` when the URL
        already matches ``img_url_exp`` or when the upload reports an error
        (a truthy result from ``upload_data``).
        """
        from gl import LOG
        LOG.info('Original Image Url [%s]' % ori_url)
        if self.img_url_exp.match(ori_url):
            return None

        # File name: md5 of the URL plus everything from the last '.'.
        suffix = ori_url[ori_url.rfind('.'):]
        qiniu_file_name = md5(ori_url).hexdigest() + suffix
        LOG.info('Open Refer Imgage[%s]' % ori_url)
        response = urllib2.urlopen(urllib2.Request(ori_url))
        try:
            img_data = response.read()
        finally:
            # Release the connection even if the read fails.
            response.close()

        qiniu = QiniuWrap()
        res = qiniu.upload_data('qdimg', qiniu_file_name, img_data)
        if res:
            LOG.error('upload qiniu error [%s]' % res)
            return None
        qiniu_url = self.qiniu_prefix % ('qdimg', qiniu_file_name)
        LOG.info('[%s] local [%s] successful' % (ori_url, qiniu_url))
        return qiniu_url

    def transcode(self, string):
        """Queue ``_transcode(string)`` on the thread pool."""
        self.pool.add_job(self._transcode, string)

    # Original note: notify idc_api that transcoding has finished.
    def _transcode(self, filepath):
        """Placeholder job: print the path, sleep 100 seconds, print 'ok'."""
        print(filepath)
        time.sleep(100)
        print('ok')

    def kill(self):
        """Send SIGKILL to the process id recorded by ``_get_pid``."""
        os.kill(self.pid, signal.SIGKILL)

    def __getattribute__(self, name):
        """Return the attribute, or ``None`` when it does not exist.

        Only ``AttributeError`` is translated to ``None``; other exceptions
        propagate instead of being silently swallowed.
        """
        try:
            return object.__getattribute__(self, name)
        except AttributeError:
            return None
class Crawler():
    """Depth-limited crawler that hands URL fetches to a thread pool.

    URLs for the current depth live in ``will_visited_urls``; links found
    while processing them are collected in ``temp_q`` and become the work
    list of the next depth.
    """

    def __init__(self, myconfig):
        # Thread pool.
        self.thread_pool = ThreadPool(myconfig.threadnum)
        # Set of URLs already visited.
        self.visited_urls = set()
        # A set is not thread-safe, so it is paired with a lock.
        self.visited_urls_lock = threading.Lock()
        # URLs still to visit at the current depth.
        self.will_visited_urls = deque()
        self.will_visited_urls.append(myconfig.url)
        # Links discovered at the current depth (next depth's work list).
        self.temp_q = deque()
        self.cur_depth = 0
        self.status = ""
        self.myconfig = myconfig
        MyLogger(myconfig.logfile, myconfig.loglevel)
        self.db = Db()

    def start(self):
        """Crawl depth by depth until the configured depth or a stop request.

        For each depth: queue every pending URL as a job, signal the pool,
        wait until all pool threads are idle again, then promote the links
        gathered in ``temp_q`` to the next work list. The pool threads are
        stopped at the end.
        """
        self.status = "start"
        while self.cur_depth < self.myconfig.depth:
            if self.status == "stop":
                break
            try:
                while self.will_visited_urls:
                    url = self.will_visited_urls.popleft()
                    # Queueing barely blocks: the main thread only adds
                    # jobs, the work itself runs in the pool threads.
                    self.thread_pool.add_job(self.handler, url)
                # TODO: the pool is only signalled after *all* URLs have
                # been queued, which is coarse. To save time, signal as
                # soon as the number of queued URLs reaches the initial
                # thread count, and only wait for the end of the list when
                # there are fewer URLs than threads.
                # Tell the pool threads to start a new round of fetching.
                self.thread_pool.event_do_job()
                # Yield so the worker threads get time to run.
                time.sleep(3)
            except Empty:
                # No URL left to visit.
                logging.info("no url right now")
            finally:
                # A depth is finished only once every pool thread is idle
                # again: poll until the number of waiting threads equals
                # the configured thread count.
                while (self.thread_pool.get_thread_waiting_num()
                        != self.myconfig.threadnum):
                    # Some thread is still working; sleep and re-check.
                    time.sleep(10)
            # NOTE(review): the flattened source does not show whether the
            # bookkeeping below sat inside the `finally` block; the two
            # readings differ only when the `try` body raises.
            # This depth is done; move on to the next one.
            self.cur_depth += 1
            logging.info("crawler depth now is %s" % str(self.cur_depth))
            if self.cur_depth > self.myconfig.depth:
                break
            # Links found at this depth were put in temp_q; make them the
            # next work list and start a fresh temp_q.
            self.will_visited_urls = self.temp_q
            self.temp_q = deque()
        # Every depth has been crawled, or the crawler was asked to stop.
        self.thread_pool.stop_threads()
        logging.info("crawler exit")

    def handler(self, url):
        """Job run by a pool thread: fetch one URL and collect its links.

        Marks the URL visited, stores the page when it contains the
        configured key, and queues every link that is neither visited nor
        already queued.
        """
        content = self.get_html_content(url)
        if not content:
            # Could not fetch any content; nothing to do.
            return
        # Record this URL as visited.
        self.add_url_to_visited(url)
        if self.myconfig.key in content:
            self.db.save_data(url, self.myconfig.key, content)
        try:
            hrefs = self.get_hrefs(content, url)
        except StandardError as se:
            # Could not extract links; log and give up on this page.
            logging.error("error: %s", se)
            print(se)
            return
        if hrefs:
            # Park the links in temp_q; they are visited after the current
            # depth has finished.
            for link in hrefs:
                # Final check: skip anything visited or already queued.
                if not self.is_url_visited(link) \
                        and link not in self.will_visited_urls \
                        and link not in self.temp_q:
                    self.temp_q.append(link)
class Interface(object):
    """Command interface: image localisation to Qiniu and a transcode stub.

    Attribute lookups that fail return ``None`` instead of raising
    ``AttributeError`` (see ``__getattribute__``).
    """

    # URLs matching this pattern are treated as already hosted on Qiniu.
    # NOTE(review): the dots in the host name are unescaped, so each one
    # matches any character.
    img_url_exp = re.compile(r'http://qdimg.okjiaoyu.cn/[\S\s]*')
    # Filled with (bucket, file name) to build the public URL.
    qiniu_prefix = 'http://%s.okjiaoyu.cn/%s'

    def __init__(self):
        self._read_config()
        self._init_threadpool()
        self._get_pid()

    def _read_config(self):
        """Read the queue size and thread-pool size from the configuration."""
        configer = Configer()
        self.queue_size = configer.get_configer('QUEUE', 'queue_size')
        self.thread_pool_num = configer.get_configer('THREADPOOL', 'num')

    def _init_threadpool(self):
        """Create the thread pool from the configured values (cast to int)."""
        self.pool = ThreadPool(int(self.thread_pool_num), int(self.queue_size))

    def _get_pid(self):
        """Remember the current process id for ``kill``."""
        self.pid = os.getpid()

    def write(self, string):
        """Print ``string`` to standard output."""
        print(string)

    @staticmethod
    def _iter_image_nodes(content):
        """Yield every dict in ``content`` whose 'type' is 'image'.

        ``content`` maps keys to sequences of items. For each item, in
        order:
        - a dict: first the image dicts inside its 'group' (if present),
          then the dict itself when it is an image;
        - a list: the image dicts it contains.
        Anything else, including non-dict entries inside a group or list,
        is skipped. The yielded dicts are the objects stored in
        ``content``, so mutating them updates ``content`` in place.
        """
        def is_image(node):
            return isinstance(node, dict) and node.get('type') == 'image'

        for _key, entries in content.items():
            for item in entries:
                if isinstance(item, dict):
                    if 'group' in item:
                        for member in item['group']:
                            if is_image(member):
                                yield member
                    if is_image(item):
                        yield item
                elif isinstance(item, list):
                    for member in item:
                        if is_image(member):
                            yield member

    def local_img(self, string):
        """Re-host the images of one question on Qiniu and update Mongo.

        ``string`` is the question id (converted with ``int``). Each image
        node in the question's JSON content is uploaded through
        ``_upload_qiniu``; on success its 'value' and every occurrence of
        the old URL in the HTML text are replaced. Both collections are
        written back only when at least one URL changed.
        """
        from gl import LOG
        LOG.info('start local img,question id [%s]' % string)
        question_id = int(string)
        query = {'question_id': question_id}

        mongo = Mongo()
        mongo.connect('resource')
        mongo.select_collection('mongo_question_json')
        question = mongo.find_one(query, {'content': 1})
        mongo.select_collection('mongo_question_html')
        # NOTE(review): this stringifies whatever find_one returns, not its
        # 'content' field, and that string is later stored as 'content'.
        # Kept as-is; confirm against the Mongo wrapper.
        html = str(mongo.find_one(query, {'content': 1}))

        if not question:
            return
        content = question['content']

        updated = False
        for node in self._iter_image_nodes(content):
            ori_url = node['value']
            qiniu_url = self._upload_qiniu(ori_url)
            if qiniu_url:
                node['value'] = qiniu_url
                html = html.replace(ori_url, qiniu_url)
                updated = True
        if not updated:
            return

        mongo.select_collection('mongo_question_json')
        json_effected = mongo.update_many(query, {'$set': {'content': content}})
        mongo.select_collection('mongo_question_html')
        html_effected = mongo.update_many(query, {'$set': {'content': html}})
        LOG.info('mongo update successful json[%d] -- html[%d]' %
                 (json_effected, html_effected))

    def _upload_qiniu(self, ori_url):
        """Download ``ori_url`` and upload the bytes to the 'qdimg' bucket.

        Returns the new Qiniu URL on success. Returns ``None`` when the URL
        already matches ``img_url_exp`` or when the upload reports an error
        (a truthy result from ``upload_data``).
        """
        from gl import LOG
        LOG.info('Original Image Url [%s]' % ori_url)
        if self.img_url_exp.match(ori_url):
            return None

        # File name: md5 of the URL plus everything from the last '.'.
        suffix = ori_url[ori_url.rfind('.'):]
        qiniu_file_name = md5(ori_url).hexdigest() + suffix
        LOG.info('Open Refer Imgage[%s]' % ori_url)
        response = urllib2.urlopen(urllib2.Request(ori_url))
        try:
            img_data = response.read()
        finally:
            # Release the connection even if the read fails.
            response.close()

        qiniu = QiniuWrap()
        res = qiniu.upload_data('qdimg', qiniu_file_name, img_data)
        if res:
            LOG.error('upload qiniu error [%s]' % res)
            return None
        qiniu_url = self.qiniu_prefix % ('qdimg', qiniu_file_name)
        LOG.info('[%s] local [%s] successful' % (ori_url, qiniu_url))
        return qiniu_url

    def transcode(self, string):
        """Queue ``_transcode(string)`` on the thread pool."""
        self.pool.add_job(self._transcode, string)

    # Original note: notify idc_api that transcoding has finished.
    def _transcode(self, filepath):
        """Placeholder job: print the path, sleep 100 seconds, print 'ok'."""
        print(filepath)
        time.sleep(100)
        print('ok')

    def kill(self):
        """Send SIGKILL to the process id recorded by ``_get_pid``."""
        os.kill(self.pid, signal.SIGKILL)

    def __getattribute__(self, name):
        """Return the attribute, or ``None`` when it does not exist.

        Only ``AttributeError`` is translated to ``None``; other exceptions
        propagate instead of being silently swallowed.
        """
        try:
            return object.__getattribute__(self, name)
        except AttributeError:
            return None
import os import sys from thread_pool import ThreadPool def myTest(s1, s2): print s1, s2 tp = ThreadPool(3) for i in xrange(3): tp.add_job(myTest, str(i), str(i + 10)) tp.begin_to_finish()
import os import sys from thread_pool import ThreadPool def myTest(s1, s2): print s1,s2 tp = ThreadPool(3) for i in xrange(3): tp.add_job(myTest, str(i), str(i+10)) tp.begin_to_finish()