def generate_nextround_task_thread(self, cntRound):
    """Run ``generate_nextround_task`` for ``cntRound`` without letting errors escape.

    On success ``self.isGeneratingTasks`` is cleared.  On any exception the
    error is logged, an alert mail naming ``self.jobID`` is sent, and
    ``self.isGeneratingTasks`` is left untouched.

    Args:
        cntRound: round number forwarded to ``self.generate_nextround_task``.
    """
    try:
        self.generate_nextround_task(cntRound)
        self.isGeneratingTasks = False
    except Exception as e:
        # Same severity as generate_task_from_job_thread uses for the same
        # failure (it also triggers an '[error]' mail).
        log.warning(e)
        # str() keeps the handler itself from raising if jobID is not a string,
        # which would otherwise lose the alert mail.
        mail.send_mail('[error]generating task for job, jobID:'
                       + str(self.jobID)
                       + ' failed, you should check why and restart the job')
def check_workermanager(self):
    """Find one shut-down workermanager and remove it from the registry.

    Holds ``self.__bigGranularityLock`` for the whole scan.  For the first
    registered ip whose ``get_workermanagerstatus(ip)`` equals
    ``status.WORKER.shutdown`` this method:

    * logs a warning (including the workermanager's task queue) and sends a
      warning mail,
    * calls ``self.taskPool.set_task_status(pos, status.TASK.done,
      status.TASK.dispatched)`` for every task in that queue, where ``pos``
      is ``decodeID.taskID(task.get_ID())``,
    * removes the ip from ``self.__workermanagerdict`` and stops scanning.

    Exceptions raised during the scan are logged and swallowed.  The lock is
    always released before returning.
    """
    self.__bigGranularityLock.acquire()
    try:
        for ip, workermanagerinfo in self.__workermanagerdict.items():
            if get_workermanagerstatus(ip) != status.WORKER.shutdown:
                continue

            log.warning('workermanager, ip:' + str(ip)
                        + ' is shutdown, we will remove all tasks on it:')
            log.warning(workermanagerinfo.get_taskqueue())
            mail.send_mail('[warning] workermanager, ip:' + str(ip)
                           + ' is shutdown, please check it')

            for task in workermanagerinfo.get_taskqueue():
                pos = decodeID.taskID(task.get_ID())
                self.taskPool.set_task_status(
                    pos, status.TASK.done, status.TASK.dispatched)

            self.__workermanagerdict.pop(ip)
            # The dict was just mutated, so stop iterating; at most one
            # workermanager is removed per call.
            break
    except Exception as e:
        log.info(e)
    finally:
        # Release on every path; previously the lock stayed held forever.
        self.__bigGranularityLock.release()
def generate_task_from_job_thread(self):
    """Generate (or recover) the tasks of the first unfinished round of this job.

    Steps, all inside one error boundary:

    1. If ``self.is_this_job_has_finished(self.jobType)`` is true, set
       ``self.currentRound`` to the last required round, clear
       ``self.isGeneratingTasks`` and return.
    2. Unless the source file type is ``DiskFile`` or ``UrlFile``, create
       ``self.srcClient`` through ``clientfactory.create_client``.
    3. Walk the round-status objects in the destination bucket, starting at
       round 0, until one is not ``'done'``.
    4. If that round's status is ``'ready'``, recover it.  Otherwise generate
       round 0 or the next round and write a ``'ready'`` round status for
       ``self.startRound``.
    5. Clear ``self.isGeneratingTasks``.

    Any exception is logged and reported by mail; ``self.isGeneratingTasks``
    is then left unchanged.
    """

    def status_key(round_no):
        # Object key under which the status of one round is stored.
        return (self.synctask + '/' + self.jobType + '/round'
                + str(round_no) + '/roundstatus')

    try:
        bucket = self.destination['bucketName']

        if self.is_this_job_has_finished(self.jobType):
            log.info('the ' + self.jobType + ' of ' + self.synctask
                     + ' is already finished')
            self.currentRound = self.startRound + self.requireRoundNum - 1
            self.isGeneratingTasks = False
            return

        needs_client = (
            self.source['filetype'] != type.FileType.DiskFile
            and self.source['filetype'] != type.FileType.UrlFile)
        if needs_client:
            self.srcClient = clientfactory.create_client(
                self.source['filetype'],
                self.source['accessKey'],
                self.source['secretKey'],
                self.source['endpoint'])

        # Skip every round already marked 'done'.
        round_no = 0
        round_state = self.get_this_round_status(bucket, status_key(round_no))
        while round_state == 'done':
            round_no += 1
            round_state = self.get_this_round_status(
                bucket, status_key(round_no))

        if round_state == 'ready':
            self.recover_this_round_task(round_no)
        else:
            if round_no == 0:
                self.generate_round0_task()
            else:
                self.startRound = round_no
                self.generate_nextround_task(round_no)
            put_roundstatus(self.desClient, bucket,
                            status_key(self.startRound), 'ready')

        self.isGeneratingTasks = False
    except (ossexception.GetObjectFailed, ossexception.PutObjectFailed,
            ossexception.ListObjectFailed, taskexception.GenerateTaskFailed,
            Exception) as e:
        log.warn(e)
        mail.send_mail('[error]generating task for job, jobID:' + self.jobID
                       + ' failed, you should check why and restart the job')
def monitor_thread(): while True: try: flag = 0 WorkerManager.bigGranularityLock.acquire() for worker in WorkerManager.workerqueue: workerstatus = get_workerstatus(worker.get_port()) if workerstatus == status.WORKER.shutdown: if worker.get_status() == status.WORKER.busy: flag = 1 task = worker.get_task() WorkerManager.workerqueue.remove(worker) message = WorkerMessage(WorkerManager.ip, task['ID'], worker.get_port(), False) message.set_type(task['type']) log.warning( '[warning] ' + 'worker, port:' + str(worker.get_port()) + ' ip:' + str(WorkerManager.ip) + ' is shutdown, we will remove task, taskID:' + str(task['ID']) + ' on it') if task.get_isDeleted() == False: mail.send_mail( '[warning] ' + 'worker, port:' + str(worker.get_port()) + ' ip:' + str(WorkerManager.ip) + ' is shutdown, we will remove task, taskID:' + str(task['ID']) + ' on it') else: WorkerManager.workerqueue.remove(worker) log.warning('[warning] ' + 'worker, port:' + str(worker.get_port()) + ' ip:' + str(WorkerManager.ip) + ' is shutdown, no tasks on it') mail.send_mail('[warning] ' + 'worker, port:' + str(worker.get_port()) + ' ip:' + str(WorkerManager.ip) + ' is shutdown, no tasks on it') break except Exception, e: log.warning(e) finally:
def monitor_thread(): while True: try: MasterManager.bigGranularityLock.acquire() flag = 0 message = None for master in MasterManager.masterqueue: masterstatus = get_masterstatus(master.get_port()) if masterstatus == status.MASTER.shutdown: if master.get_status() == status.MASTER.busy: flag = 1 job = master.get_job() message = Message('127.0.0.1', job['job-ID'], master.get_port(), False) MasterManager.masterqueue.remove(master) log.warning( 'master, port:' + str(master.get_port()) + ' is shutdown, we will remove job, jobID:' + job['job-ID'] + ' on it') mail.send_mail( '[warning] master, port:' + str(master.get_port()) + ' is shutdown, we will remove job, jobID:' + job['job-ID'] + ' on it') break else: MasterManager.masterqueue.remove(master) log.warning('master, port:' + str(master.get_port()) + ' is shutdown, no jobs on it') mail.send_mail('master, port:' + str(master.get_port()) + ' is shutdown, no jobs on it') break elif masterstatus == status.MASTER.busy: set_master_token(master.get_port()) except Exception, e: log.info(e) finally:
def send_message_to_master(master_ip_port, message):
    """Deliver ``message`` to the master over XML-RPC, retrying until accepted.

    Each attempt opens ``http://<master_ip_port>`` and calls the remote
    ``send_message``.  A truthy reply ends the loop.  A raised exception or a
    falsy reply counts as a failed attempt: it is logged, followed by a
    10-second pause before the next try.

    Once more than 5 attempts have failed, an alert mail is sent after every
    further failed attempt.  The loop itself never gives up.

    Args:
        master_ip_port: ``host:port`` of the master's XML-RPC server.
        message: object exposing ``get_ID()``; passed to the remote call as is.

    Returns:
        True, once the master has accepted the message.
    """
    failed_attempts = 0
    while True:
        try:
            proxy = xmlrpclib.ServerProxy('http://' + str(master_ip_port),
                                          allow_none=True)
            if proxy.send_message(message):
                # str(): a non-string ID must not raise here, otherwise an
                # already-delivered message would be sent again.
                log.info('send message:' + str(message.get_ID())
                         + ' to master successfully, master_ip_port:'
                         + str(master_ip_port))
                return True
        except Exception as e:
            log.warning(e)

        # Reached after an exception or a falsy reply, so both back off.
        log.warning('send message to master failed, master_ip_port:'
                    + str(master_ip_port) + ', will retry')
        failed_attempts += 1
        sleep(10)
        if failed_attempts > 5:
            mail.send_mail('[error] send message to master failed, master_ip_port:'
                           + str(master_ip_port) + ', you should check why')
def send_message(self, messageAttr):
    """Handle a job-result message and update the matching job in the queue.

    ``messageAttr`` is turned into a ``Message``; the first job in
    ``JobManager.jobqueue`` whose ID matches (compared as strings) is then
    handled as follows:

    * success: status ``'done'`` is saved locally, the job is removed from
      the queue and an info mail is sent.  If the job has
      ``'sync-enable-increment' == 'True'``, its last-done time is saved and
      its sync counter is bumped.
    * failure with retries left: the retry counter is incremented, the job
      is set back to ``status.JOB.ready`` and a warning mail is sent.
    * failure with no retries left: status ``'failed'`` is saved locally
      (plus the last-done time for incremental jobs), the job is removed and
      an error mail is sent.

    The whole operation runs under ``JobManager.bigGranularityLock``, which
    is released on every path.  Exceptions are logged and swallowed.

    Args:
        messageAttr: attribute payload passed to ``Message().with_attribute``.

    Returns:
        Always True, including when no job matches or an error occurred.
    """
    JobManager.bigGranularityLock.acquire()
    try:
        message = Message().with_attribute(messageAttr)
        log.info('receive message from mastermanager, job:'
                 + str(message.get_ID()))
        log.info(messageAttr)

        found = False
        for job in JobManager.jobqueue:
            if str(job.get_ID()) != str(message.get_ID()):
                continue
            found = True

            # Explicit comparison kept on purpose: only True/1 means success.
            if message.get_isSuccess() == True:
                commands.save_job_status_to_local(job.get_ID(), 'done\n')
                if job.get_job()['sync-enable-increment'] == 'True':
                    save_job_lastdonetime(job.get_ID())
                    syncinfo_times_pp(job.get_ID())
                JobManager.jobqueue.remove(job)
                note = '[info] ' + job.to_string() + ' finished successfully'
                log.info(note)
                mail.send_mail(note)
            elif job.get_retrytimes() < JobManager.MAXJOBRETRYTIMES:
                job.set_retrytimes(job.get_retrytimes() + 1)
                job.set_status(status.JOB.ready)
                note = ('[warning] ' + job.to_string()
                        + ' failed, we will retry, retrytimes:'
                        + str(job.get_retrytimes()))
                log.info(note)
                mail.send_mail(note)
            else:
                commands.save_job_status_to_local(job.get_ID(), 'failed\n')
                if job.get_job()['sync-enable-increment'] == 'True':
                    save_job_lastdonetime(job.get_ID())
                JobManager.jobqueue.remove(job)
                note = '[error] ' + job.to_string() + ' failed, please check it'
                log.warning(note)
                mail.send_mail(note)

            # The queue may have been mutated above; only the first match
            # is handled.
            break

        if not found:
            # str() for consistency with the ID handling above; a non-string
            # ID would otherwise raise here and hide this log line.
            log.info('we didn\'t have any record of ' + str(message.get_ID()))
    except Exception as e:
        log.info(e)
    finally:
        # Previously the lock was never released, blocking every later caller.
        JobManager.bigGranularityLock.release()
    return True