def gotResult(self, data, task, ttype):
    '''
    Handle the data returned for a task.

    There are two kinds of task:
    1. shop information: the shop's item list has to be crawled;
    2. item information: the item's basic information has to be crawled.

    For ``ttype == 'extract'`` ``data`` is a JSON document holding
    ``(total_page, hrefs)``, where ``hrefs`` is itself a JSON-encoded
    string.  Every href is queued on ``'task_queue'``; when the pickled
    ``task`` is on page 1 and more pages exist, one follow-up task per
    remaining page is queued on ``'extract_queue'``.  For any other
    ``ttype`` the decoded ``data`` is handed to ``save_items``.  Falsy
    ``data`` is only logged at debug level.
    '''
    # TODO refactor this
    if not data:
        log.debug('Got an invalid task: %s when taking task: %s' % (task, ttype))
        return

    if ttype != 'extract':
        save_items(json.loads(data))
        return

    page_count, encoded_links = json.loads(data)
    page_count = int(page_count)
    links = json.loads(encoded_links)

    # The return value is not used (saving the checked ids is disabled);
    # the call itself is kept.
    check_duplicate(self.redis, links)

    # One item task per link found on the listing page.
    for link in links:
        item_tid = self.new_task_id()
        log.info(link)
        item_task = BaseTask(item_tid, {'task': link})
        self.redis.push_list_data('task_queue', cPickle.dumps(item_task))

    # NOTE(review): unpickling is only safe if `task` always comes from
    # this system's own queues -- confirm.
    listing = cPickle.loads(task)
    current = listing.tbody.get('page', 1)
    if current != 1 or current >= page_count:
        return

    # Only the first page fans out to the remaining pages.  The same tbody
    # dict is updated in place and pickled once per page.
    body = listing.tbody
    for offset in xrange(current, page_count):
        page_tid = self.new_task_id()
        body['page'] = offset + 1
        page_task = BaseTask(page_tid, body)
        self.redis.push_list_data('extract_queue', cPickle.dumps(page_task))
def register(
        self,
        servicename,
        version_major,
        version_minor,
        nodename,
        client,
):
    '''
    Register a client node with the controller.

    Returns a ``(message, clientid)`` tuple: ``('succeed', clientid)``
    when the client was added, otherwise an error message and ``0``.
    A client is refused when ``servicename`` is not a key of
    ``self.versions`` or when ``(version_major, version_minor)`` lies
    outside the range stored there (index 0 is the lowest accepted
    version, index 1 the highest).
    '''
    clientid = self.newClientId()

    # A servicename that is not in versions means the node is invalid.
    if servicename not in self.versions:
        log.info("Added client: %s %s Failed. No such servicename" % (str(clientid), servicename))
        return (servicename + ' is not in a known service', 0)

    supported = self.versions[servicename]
    reported = (version_major, version_minor)

    if reported < supported[0]:
        too_old = 'version %s is below %s please update the client' % (
            repr(reported),
            repr(supported[0]),
        )
        return (too_old, 0)

    if reported > supported[1]:
        too_new = 'version %s is above %s please update the controller' % (
            repr(reported),
            repr(supported[1]),
        )
        return (too_new, 0)

    record = {
        'client': client,
        'id': clientid,
        'servicename': servicename,
        'name': nodename,
        'users': [],
        'processing': {},
        'ip': client.broker.transport.getPeer().host,
        'last_call': time.time(),
    }
    self.clients[clientid] = record

    def drop_on_disconnect(_disconnected):
        # Remove the registration again once the connection goes away.
        return self.unregister(clientid)

    client.notifyOnDisconnect(drop_on_disconnect)
    log.info("Added client: %s %s" % (str(clientid), servicename))
    return ('succeed', clientid)
def register(self, servicename, version_major, version_minor, nodename, client):
    '''
    Add a client to the controller's client table.

    The client is accepted only if ``servicename`` is known to
    ``self.versions`` and its ``(version_major, version_minor)`` is not
    below ``versions[servicename][0]`` and not above
    ``versions[servicename][1]``.

    Returns ``('succeed', clientid)`` on success, or
    ``(error message, 0)`` when the client is refused.
    '''
    clientid = self.newClientId()

    # If the given servicename is not in versions, the node is invalid.
    known = servicename in self.versions
    if not known:
        log.info("Added client: %s %s Failed. No such servicename" % (str(clientid), servicename))
        return (servicename + ' is not in a known service', 0)

    version = self.versions[servicename]
    client_version = (version_major, version_minor)

    if client_version < version[0]:
        lowest = repr(version[0])
        return ('version %s is below %s please update the client' % (repr(client_version), lowest), 0)

    if client_version > version[1]:
        highest = repr(version[1])
        return ('version %s is above %s please update the controller' % (repr(client_version), highest), 0)

    # Bookkeeping record kept for as long as the client stays connected.
    self.clients[clientid] = {
        'client': client,
        'id': clientid,
        'servicename': servicename,
        'name': nodename,
        'users': [],
        'processing': {},
        'ip': client.broker.transport.getPeer().host,
        'last_call': time.time(),
    }

    # Unregister automatically when the connection is lost.
    client.notifyOnDisconnect(lambda _gone: self.unregister(clientid))

    log.info("Added client: %s %s" % (str(clientid), servicename))
    return ('succeed', clientid)
def clientPull(self, clientid):
    '''
    Hand the next request for the client's service to that client.

    If the service's push queue is empty, a pull record is parked in the
    pull queue and the generator waits on the record's Deferred until a
    request is delivered; otherwise the oldest queued request id is
    taken.  In both cases the request id is added to the client's
    ``processing`` set and a call to ``clientProcTimeout`` is scheduled
    after ``client_request_timeout``.

    The result, passed through ``returnValue``, is the tuple
    ``(id, method, args, kwargs)`` of the request for the client to run.

    NOTE(review): the body uses ``yield`` / ``returnValue``, so it is
    presumably wrapped with Twisted's ``inlineCallbacks`` where it is
    defined -- the decorator is not visible here, confirm.
    '''
    client = self.clients[clientid]
    # `name` was previously read without ever being bound (NameError).
    # The queues are keyed by service name, which the client record holds.
    name = client['servicename']
    push_queue = self.push_queue[name]

    if not push_queue:
        # Nothing is waiting: record a pull request and wait until a
        # request is delivered through its Deferred.
        defer = Deferred()
        pullid = self.newRequestId()
        pull = {
            'id': pullid,
            'servicename': name,
            'defer': defer,
            'clientid': clientid,
        }
        client['pulling'].add(pullid)
        self.pull_requests[pullid] = pull
        self.pull_queue[name].append(pullid)
        request = yield defer
        del self.pull_requests[pullid]
        requestid = request['id']
        client['processing'].add(requestid)
    else:
        # Take the oldest queued request and mark it as being processed.
        requestid = push_queue.popleft()
        client['processing'].add(requestid)
        request = self.push_requests[requestid]

    # Schedule the timeout handler for this request / client pair.
    self.processing_timeout[requestid] = reactor.callLater(
        self.client_request_timeout,
        self.clientProcTimeout,
        requestid,
        clientid,
    )
    log.info("Sent To: clientid %s, requestid %s." % (
        clientid,
        request['id'],
    ))
    # Return requestid, method, args, kwargs to the client, which runs it.
    returnValue((
        request['id'],
        request['method'],
        request['args'],
        request['kwargs'],
    ))
def clientReturn(self, clientid, requestid, result):
    '''
    Accept the result a client sends back for a request.

    ``result`` is a mapping: on success it carries the value under
    ``'result'``; on failure it carries ``'error'`` with the keys
    ``'type'``, ``'value'`` and ``'traceback'``.  The request is removed
    from the client's ``processing`` set, its pending timeout call is
    cancelled, and -- if the request is still known in
    ``self.push_requests`` -- its Deferred is fired with the value or
    with a ``pb.RemoteError`` failure.  Afterwards the request id is
    removed, best effort, from the service push queue and from the
    pushing client's ``pushing`` set.
    '''
    log.info("Returned: clientid: %s, requestid: %s" % (
        clientid,
        requestid,
    ))
    # Remove this request from the client's processing set.
    client = self.clients[clientid]
    client['processing'].discard(requestid)

    # Cancel the timeout; a missing entry means the request was already
    # finished.
    try:
        self.processing_timeout[requestid].cancel()
        del self.processing_timeout[requestid]
    except KeyError:
        pass

    if requestid in self.push_requests:
        push = self.push_requests[requestid]
        if 'error' not in result:
            push['defer'].callback(result['result'])
        else:
            error = result['error']
            push['defer'].errback(
                failure.Failure(
                    pb.RemoteError(
                        error['type'],
                        error['value'],
                        error['traceback'],
                    )))

        servicename = push['servicename']
        # Best-effort removal from the push queue.  `except Exception`
        # instead of a bare `except` keeps the clean-up tolerant while
        # letting KeyboardInterrupt / SystemExit propagate.
        try:
            self.push_queue[servicename].remove(requestid)
        except Exception:
            pass

        if push['clientid'] is not None:
            # Best effort as well: the pushing client may be gone already.
            try:
                self.clients[push['clientid']]['pushing'].discard(
                    requestid)
            except Exception:
                pass
def clientReturn(self, clientid, requestid, result):
    '''
    Process the answer of a client for one request.

    The request id is dropped from the client's ``processing`` set and
    the scheduled timeout call for it is cancelled.  When the id is
    still present in ``self.push_requests`` the stored Deferred is fired:
    with ``result['result']`` if ``result`` has no ``'error'`` key,
    otherwise with a ``failure.Failure`` wrapping a ``pb.RemoteError``
    built from ``result['error']`` (keys ``'type'``, ``'value'``,
    ``'traceback'``).  Finally the id is removed, best effort, from the
    service's push queue and from the ``pushing`` set of the client that
    pushed it.
    '''
    log.info("Returned: clientid: %s, requestid: %s" % (
        clientid,
        requestid,
    ))

    # Remove this request from the processing set of the client.
    client = self.clients[clientid]
    client['processing'].discard(requestid)

    # Try to cancel the processing timeout.  A KeyError means the request
    # was already finished.
    try:
        self.processing_timeout[requestid].cancel()
        del self.processing_timeout[requestid]
    except KeyError:
        pass

    if requestid in self.push_requests:
        push = self.push_requests[requestid]
        if 'error' not in result:
            push['defer'].callback(result['result'])
        else:
            error = result['error']
            remote_error = pb.RemoteError(
                error['type'],
                error['value'],
                error['traceback'],
            )
            push['defer'].errback(failure.Failure(remote_error))

        servicename = push['servicename']

        # Remove the request from the push queue if it is still there.
        # Narrowed from a bare `except` so that only ordinary errors are
        # ignored; KeyboardInterrupt / SystemExit are no longer swallowed.
        try:
            self.push_queue[servicename].remove(requestid)
        except Exception:
            pass

        if push['clientid'] is not None:
            try:
                self.clients[push['clientid']]['pushing'].discard(requestid)
            except Exception:
                # The pushing client may no longer be registered.
                pass
def gotResult(self, data, task, ttype):
    '''
    Process the data produced for a task.

    Tasks come in two kinds:
    1. shop information -- the item list of the shop must be crawled;
    2. item information -- the basic information of the item must be
       crawled.

    ``ttype == 'extract'``: ``data`` decodes (JSON) to
    ``(total_page, hrefs)`` with ``hrefs`` being a JSON string of links.
    Each link becomes a task on ``'task_queue'``.  If the pickled
    ``task`` is for page 1 and there are further pages, a task for each
    following page is pushed onto ``'extract_queue'``.
    Other ``ttype`` values: the decoded data goes to ``save_items``.
    Falsy ``data``: a debug message is logged and nothing else happens.
    '''
    # TODO refactor this
    if not data:
        log.debug('Got an invalid task: %s when taking task: %s' % (task, ttype))
    elif ttype != 'extract':
        save_items(json.loads(data))
    else:
        decoded = json.loads(data)
        total_page, raw_hrefs = decoded
        total_page = int(total_page)
        hrefs = json.loads(raw_hrefs)

        # Called for whatever effect it has; its result is not consumed
        # because saving the checked ids is switched off.
        check_duplicate(self.redis, hrefs)

        for href in hrefs:
            new_tid = self.new_task_id()
            log.info(href)
            payload = cPickle.dumps(BaseTask(new_tid, {'task': href}))
            self.redis.push_list_data('task_queue', payload)

        # NOTE(review): `task` is unpickled here; that is only safe when
        # it never comes from an untrusted source -- confirm.
        source_task = cPickle.loads(task)
        page = source_task.tbody.get('page', 1)
        first_of_many = page == 1 and page < total_page
        if first_of_many:
            # Re-use the original task body, changing only the page
            # number before each follow-up task is pickled.
            shared_body = source_task.tbody
            for previous in xrange(page, total_page):
                new_tid = self.new_task_id()
                shared_body['page'] = previous + 1
                payload = cPickle.dumps(BaseTask(new_tid, shared_body))
                self.redis.push_list_data('extract_queue', payload)
def unregister(self, clientid):
    '''
    Remove the given clientid from ``self.clients``.

    Returns True when the client was registered and has been removed,
    False when the clientid is unknown.
    '''
    if clientid in self.clients:
        # NOTE: re-queuing or dropping the client's pulling / processing /
        # pushing requests is currently disabled; only the client record
        # itself is removed.
        del self.clients[clientid]
        log.info("Removed client: " + str(clientid))
        return True
    return False
def searchLoop(self, agent):
    '''
    Keep pulling 'extract' requests from the controller and running them
    with ``agent``.

    Each round asks the controller for the next request, runs
    ``self.search`` on it and sends the JSON-encoded result back with
    ``'sendResult'`` (``null`` when the search raised).  The loop ends
    when ``agent.remove`` is set (the agent is then removed from the
    agent pool) or after an ``InfiniteLoginError``, which is also
    reported to the controller via ``'fail'``.

    NOTE(review): this is a generator that yields the results of
    ``callController`` / ``search``; presumably it is driven by Twisted's
    ``inlineCallbacks`` -- the decorator is not visible here, confirm.
    '''
    needbreak = False
    while 1:
        result = None
        if agent.remove:
            self.agent_pool.removeAgent(agent)
            break
        reqid, task = yield self.callController('nextRequest', 'extract')
        log.info(repr(task))
        try:
            result = yield self.search(agent, task)
            log.debug('Got data %s' % repr(result))
        except InfiniteLoginError:
            log.exception()
            yield self.callController("fail", task=task)
            needbreak = True
        except Exception:
            # Was a bare `except:`.  Around a `yield` that also traps
            # GeneratorExit (and KeyboardInterrupt), after which the loop
            # would go on and yield again; only ordinary errors are
            # logged and skipped now.
            log.exception()
        # Not yielded: the loop does not wait for the controller's reply.
        self.callController('sendResult', reqid, task, json.dumps(result))
        if needbreak:
            break
def searchLoop(self, agent):
    '''
    Keep pulling 'data' requests from the controller and running them
    with ``agent``.

    Every iteration fetches ``(reqid, task)`` with ``'nextRequest'``,
    runs ``self.search(agent, task)`` and reports the JSON-encoded
    result through ``'sendResult'`` (``null`` if the search raised).
    The loop stops when ``agent.remove`` is true -- the agent is removed
    from ``self.agent_pool`` -- or after an ``InfiniteLoginError``,
    which is additionally reported through ``'fail'``.

    NOTE(review): generator that yields the values returned by
    ``callController`` / ``search``; presumably run under Twisted's
    ``inlineCallbacks`` -- the decorator is not visible here, confirm.
    '''
    needbreak = False
    while 1:
        result = None
        if agent.remove:
            self.agent_pool.removeAgent(agent)
            break
        reqid, task = yield self.callController('nextRequest', 'data')
        log.info('Got Task %s with reqid: %s' % (repr(task), reqid))
        try:
            result = yield self.search(agent, task)
            log.debug('Got data %s' % repr(result))
        except InfiniteLoginError:
            log.exception()
            yield self.callController("fail", task=task)
            needbreak = True
        except Exception:
            # Narrowed from a bare `except:` so that GeneratorExit and
            # KeyboardInterrupt raised at the `yield` are not swallowed;
            # ordinary search errors are still logged and the loop
            # carries on.
            log.exception()
        # Fire and forget: the reply of 'sendResult' is not awaited.
        self.callController('sendResult', reqid, task, json.dumps(result))
        if needbreak:
            break
def clientFail(self, *args, **kwargs):
    '''
    Log that a client reported a failure.

    Reads the keyword arguments ``clientid`` (None when absent) and
    ``reason`` (empty string when absent); positional arguments are
    accepted but not used.
    '''
    clientid = kwargs.get('clientid')
    reason = kwargs.get('reason', '')
    log.info("%s Client Failed, reason: %s" % (clientid, reason))