def fileTasksForFrames(self):
    from PYME.IO import clusterIO
    numTotalFrames = self.ds.getNumSlices()
    logging.debug('numTotalFrames: %s, currentFrameNum: %d' % (numTotalFrames, self.currentFrameNum))
    numFramesOutstanding = 0

    while numTotalFrames > (self.currentFrameNum + 1):
        logging.debug('we have unpublished frames - push them')

        #turn our metadata to a string once (outside the loop)
        #mdstring = self.mdh.to_JSON() #TODO - use a URI instead

        newFrameNum = min(self.currentFrameNum + 100000, numTotalFrames - 1)

        #release the tasks for the frames spooled since the last check
        s = clusterIO._getSession(self.taskQueueURI)
        r = s.get('%s/release_rule_tasks?ruleID=%s&release_start=%d&release_end=%d' % (self.taskQueueURI, self._ruleID, self.currentFrameNum, newFrameNum),
                  data='', headers={'Content-Type': 'application/json'})

        if r.status_code == 200 and r.json()['ok']:
            logging.debug('Successfully released tasks')
        else:
            logging.error('Failed on releasing tasks with status code: %d' % r.status_code)

        self.currentFrameNum = newFrameNum
        numFramesOutstanding = numTotalFrames - 1 - self.currentFrameNum

    return numFramesOutstanding

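# Example of the chunked release above (illustrative numbers): with
# currentFrameNum = 0 and numTotalFrames = 250000, the loop issues
# release_rule_tasks for frame ranges 0-100000, 100000-200000 and 200000-249999,
# then returns 0 frames outstanding.
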
def fileTasksForInputs(self, **kwargs):
    from PYME.IO import clusterIO
    input_names = kwargs.keys()
    inputs = {k: kwargs[k] if isinstance(kwargs[k], list) else clusterIO.cglob(kwargs[k], include_scheme=True) for k in input_names}
    numTotalFrames = len(list(inputs.values())[0])
    self.currentFrameNum = 0

    logger.debug('numTotalFrames = %d' % numTotalFrames)
    logger.debug('inputs = %s' % inputs)

    inputs_by_task = {frameNum: {k: inputs[k][frameNum] for k in inputs.keys()} for frameNum in range(numTotalFrames)}

    rule = {'template': self._taskTemplate, 'inputsByTask': inputs_by_task}

    s = clusterIO._getSession(self.taskQueueURI)
    r = s.post('%s/add_integer_id_rule?max_tasks=%d&release_start=%d&release_end=%d' % (self.taskQueueURI, numTotalFrames, 0, numTotalFrames),
               data=json.dumps(rule), headers={'Content-Type': 'application/json'})

    if r.status_code == 200:
        resp = r.json()
        self._ruleID = resp['ruleID']
        logger.debug('Successfully created rule')
    else:
        logger.error('Failed creating rule with status code: %d' % r.status_code)

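# A minimal usage sketch for fileTasksForInputs (hypothetical - the class name,
# recipe URI and file pattern are illustrative, not taken from the code above).
# Each keyword argument becomes a named input; a glob string is expanded with
# clusterIO.cglob into one file per task:
#
#   pusher = RecipePusher(recipeURI='pyme-cluster:///RECIPES/localise.yaml')
#   pusher.fileTasksForInputs(input='pyme-cluster:///mydata/series_*.h5')
#
# This builds inputs_by_task = {0: {'input': <file 0>}, 1: {'input': <file 1>}, ...}
# and posts a single rule with all tasks released immediately
# (release_start=0, release_end=numTotalFrames).
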
def _post_rule(self, timeout=3600, max_tasks=1e6, release_start=None, release_end=None):
    """ Wrapper around the add_integer_id_rule API endpoint """
    from PYME.IO import clusterIO
    s = clusterIO._getSession(self.taskQueueURI)

    if release_start is None:
        cmd = '%s/add_integer_id_rule?timeout=%d&max_tasks=%d' % (self.taskQueueURI, timeout, max_tasks)
    else:
        # TODO - can we get rid of this special casing?
        cmd = '%s/add_integer_id_rule?timeout=%d&max_tasks=%d&release_start=%d&release_end=%d' % (self.taskQueueURI, timeout, max_tasks, release_start, release_end)

    r = s.post(cmd, data=json.dumps(self.rule), headers={'Content-Type': 'application/json'})

    if r.status_code == 200:
        resp = r.json()
        self._ruleID = resp['ruleID']
        logger.debug('Successfully created rule')
    else:
        logger.error('Failed creating rule with status code: %d' % r.status_code)

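# For reference, the two request forms _post_rule generates (values illustrative):
#
#   POST {taskQueueURI}/add_integer_id_rule?timeout=3600&max_tasks=1000000
#   POST {taskQueueURI}/add_integer_id_rule?timeout=3600&max_tasks=1000000&release_start=0&release_end=500
#
# with the JSON-encoded rule as the request body. A 200 response carries the new
# ruleID, which is kept on the instance for the release_rule_tasks and
# mark_release_complete calls below.
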
def _postTasks(self, task_list):
    from PYME.IO import clusterIO

    if isinstance(task_list[0], string_types):
        # tasks are already serialised - join them into a JSON array without re-encoding
        task_list = '[' + ',\n'.join(task_list) + ']'
    else:
        task_list = json.dumps(task_list)

    s = clusterIO._getSession(self.taskQueueURI)
    r = s.post('%s/distributor/tasks?queue=%s' % (self.taskQueueURI, self.queueID),
               data=task_list, headers={'Content-Type': 'application/json'})

    if r.status_code == 200 and r.json()['ok']:
        logging.debug('Successfully posted tasks')
    else:
        logging.error('Failed on posting tasks with status code: %d' % r.status_code)

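# _postTasks accepts either pre-serialised JSON strings or plain Python objects,
# e.g. (task contents illustrative):
#
#   self._postTasks(['{"id": "0"}', '{"id": "1"}'])   # joined into a JSON array as-is
#   self._postTasks([{'id': '0'}, {'id': '1'}])       # equivalent, via json.dumps
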
def _mark_complete(self):
    """ Thin wrapper around the mark_release_complete API endpoint """
    from PYME.IO import clusterIO
    s = clusterIO._getSession(self.taskQueueURI)
    r = s.get('%s/mark_release_complete?ruleID=%s' % (self.taskQueueURI, self._ruleID),
              data='', headers={'Content-Type': 'application/json'})

    if r.status_code == 200 and r.json()['ok']:
        logging.debug('Successfully marked rule as complete')
    else:
        logging.error('Failed to mark rule complete with status code: %d' % r.status_code)

def _release_tasks(self, release_start, release_end):
    """ Thin wrapper around the release_rule_tasks API endpoint """
    from PYME.IO import clusterIO
    s = clusterIO._getSession(self.taskQueueURI)
    r = s.get('%s/release_rule_tasks?ruleID=%s&release_start=%d&release_end=%d' % (self.taskQueueURI, self._ruleID, release_start, release_end),
              data='', headers={'Content-Type': 'application/json'})

    if r.status_code == 200 and r.json()['ok']:
        logging.debug('Successfully released tasks (%d:%d)' % (release_start, release_end))
    else:
        logging.error('Failed on releasing tasks with status code: %d' % r.status_code)

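# Taken together, _post_rule, _release_tasks and _mark_complete form the rule
# lifecycle for a growing (live) series. A sketch, assuming an instance exposing
# these methods (frame counts illustrative):
#
#   self._post_rule(timeout=3600, max_tasks=1e6)   # create the rule; no tasks released yet
#   self._release_tasks(0, 1000)                   # frames 0-999 have arrived - release them
#   self._release_tasks(1000, 2000)                # release the next chunk as it arrives
#   self._mark_complete()                          # series finished; no more tasks will be added
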
def post_rule(self):
    from PYME.IO import clusterIO
    rule = {'template': self._taskTemplate}

    if self.ds.isComplete():
        queueSize = self.ds.getNumSlices()
    else:
        queueSize = 1e6

    s = clusterIO._getSession(self.taskQueueURI)
    r = s.post('%s/add_integer_id_rule?timeout=300&max_tasks=%d' % (self.taskQueueURI, queueSize),
               data=json.dumps(rule), headers={'Content-Type': 'application/json'})

    if r.status_code == 200:
        resp = r.json()
        self._ruleID = resp['ruleID']
        logging.debug('Successfully created rule')
    else:
        logging.error('Failed creating rule with status code: %d' % r.status_code)

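# post_rule sizes the rule from the datasource: a complete series caps max_tasks
# at the known slice count, while a still-spooling series reserves a generous
# upper bound (1e6) so that tasks can be released incrementally as frames arrive
# (see fileTasksForFrames above).
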
def _get_tasks(self):
    """
    Query the nodeserver for tasks and place them in the queue for this worker, if available.

    Returns
    -------
    new_tasks : bool
        flag to report whether _get_tasks added new tasks to the taskWorker queue
    """
    tasks = []
    queueURL = self._local_queue_url

    try:
        # ask the queue for tasks
        s = clusterIO._getSession(queueURL)
        r = s.get(queueURL + 'node/tasks?workerID=%s&numWant=50' % self.procName)
        if r.status_code == 200:
            resp = r.json()
            if resp['ok']:
                res = resp['result']
                if isinstance(res, list):
                    tasks += [(queueURL, t) for t in res]
                else:
                    tasks.append((queueURL, res))
    except requests.Timeout:
        logger.info('Read timeout requesting tasks from %s' % queueURL)
    except Exception:
        import traceback
        logger.exception(traceback.format_exc())

    if len(tasks) != 0:
        for t in tasks:
            self.inputQueue.put(t)
        return True
    else:
        # flag that there were no new tasks
        return False

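# The nodeserver response shape this method expects (inferred from the parsing
# above; field values illustrative):
#
#   {"ok": true,
#    "result": [{"id": "...", "inputs": {...}, "outputs": {...}}, ...]}
#
# 'result' may also be a single task object rather than a list; both cases are
# handled before the tasks are pushed onto self.inputQueue.
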
def _return_task_results(self):
    """ File all results that this worker has completed and hand the tasks back to the nodeserver """
    def _handin(queueURL, taskID, status):
        # report the task outcome ('success' or 'failure') back to the nodeserver
        s = clusterIO._getSession(queueURL)
        r = s.post(queueURL + 'node/handin?taskID=%s&status=%s' % (taskID, status))
        if not r.status_code == 200:
            logger.error('Returning task failed with error: %s' % r.status_code)

    while True: # loop over the results queue until it's empty
        try:
            queueURL, taskDescr, res = self.resultsQueue.get_nowait()
            outputs = taskDescr.get('outputs', {})
        except Queue.Empty:
            # queue is empty
            return

        if isinstance(res, TaskError): # failure, with a log to file
            clusterResults.fileResults(res.log_url, res.to_string())
            _handin(queueURL, taskDescr['id'], 'failure')
        elif res is None: # failure
            _handin(queueURL, taskDescr['id'], 'failure')
        elif res == True: # isinstance(res, ModuleCollection): recipe output
            # res.save(outputs) # abuse the outputs dictionary as a context
            _handin(queueURL, taskDescr['id'], 'success')
        else: # success - file the results before handing the task back
            try:
                if 'results' in outputs.keys():
                    # old style pickled results
                    clusterResults.fileResults(outputs['results'], res)
                else:
                    if len(res.results) > 0:
                        clusterResults.fileResults(outputs['fitResults'], res.results)
                    if len(res.driftResults) > 0:
                        clusterResults.fileResults(outputs['driftResults'], res.driftResults)
            except requests.Timeout:
                logger.exception('Filing results failed on timeout.')
                _handin(queueURL, taskDescr['id'], 'failure')
            else:
                _handin(queueURL, taskDescr['id'], 'success')

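# Handin protocol, as inferred from the calls above: for every completed task the
# worker POSTs
#
#   {nodeserver}/node/handin?taskID=<id>&status=success|failure
#
# with an empty body; the results themselves are filed separately via
# clusterResults.fileResults to the URLs named in the task's 'outputs' dictionary.
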
def _get_tasks(self, local_queue_name):
    """
    Query the nodeserver for tasks and place them in the queue for this worker, if available.

    Parameters
    ----------
    local_queue_name : str
        computer name prepended with 'PYMENodeServer: '

    Returns
    -------
    new_tasks : bool
        flag to report whether _get_tasks added new tasks to the taskWorker queue
    """
    queue_URLs = distribution.getNodeInfo()
    queue_URLs = {k: v for k, v in queue_URLs.items() if k == local_queue_name}

    # loop over all queues, looking for tasks to process
    tasks = []
    while len(tasks) == 0 and len(queue_URLs) > 0:
        # try the queue on the current machine first
        # TODO - only try the local machine?
        if local_queue_name in queue_URLs.keys():
            qName = local_queue_name
            queueURL = queue_URLs.pop(qName)
        else:
            logger.error('Could not find local node server')
            break # avoid looping forever (and referencing an undefined queueURL) if the local queue is missing

        try:
            # ask the queue for tasks
            # TODO - make the server actually return a list of tasks, not just one (or implement pipelining in another way)
            s = clusterIO._getSession(queueURL)
            r = s.get(queueURL + 'node/tasks?workerID=%s&numWant=50' % self.procName) #, timeout=0)
            if r.status_code == 200:
                resp = r.json()
                if resp['ok']:
                    res = resp['result']
                    if isinstance(res, list):
                        tasks += [(queueURL, t) for t in res]
                    else:
                        tasks.append((queueURL, res))
        except requests.Timeout:
            logger.info('Read timeout requesting tasks from %s' % queueURL)
        except Exception:
            import traceback
            logger.exception(traceback.format_exc())

    if len(tasks) != 0:
        for t in tasks:
            self.inputQueue.put(t)
        return True
    else:
        # flag that there were no new tasks
        return False

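# Note: this variant differs from the _get_tasks above only in how it locates the
# nodeserver - it queries distribution.getNodeInfo() and filters for
# local_queue_name, rather than using a cached self._local_queue_url; the polling
# and queueing logic is otherwise identical.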