def test_connection(cluster, log_write_url=None, girder_token=None):
    """
    Check that a cluster can be reached and record the outcome in Girder.

    The cluster document is re-fetched with ``girder_token``, ``pwd`` is run
    over a new connection and the cluster status is patched to 'running'
    when the command produced output, otherwise to 'error'. Any exception is
    logged after the status has been patched to 'error'; it is not re-raised.

    :param cluster: The cluster document; ``_id`` is read before the document
        is re-fetched.
    :param log_write_url: Not used by this function.
    :param girder_token: The Girder token sent with every request.
    """
    auth_headers = {'Girder-Token': girder_token}
    url = '%s/clusters/%s' % (cumulus.config.girder.baseUrl, cluster['_id'])
    log = get_cluster_logger(cluster, girder_token)

    try:
        # Re-fetch the cluster with this 'admin' token so that the
        # passphrase is filled out.
        response = requests.get(url, headers=auth_headers)
        check_status(response)
        cluster = response.json()

        with get_connection(girder_token, cluster) as conn:
            status = 'running'
            # Can a command be run on the cluster?
            output = conn.execute('pwd')

        if len(output) == 0:
            log.error('Unable connect to cluster')
            status = 'error'

        response = requests.patch(url, headers=auth_headers,
                                  json={'status': status})
        check_status(response)
    except Exception as ex:
        requests.patch(url, headers=auth_headers, json={'status': 'error'})
        # Log the error message
        log.exception(ex)
def test_put_get(self):
    """Upload the test payload, fetch it back and check it is unchanged."""
    payload = StringIO.StringIO(self.test_data)
    with httmock.HTTMock(self.me), \
            get_connection(self._girder_token, self._cluster) as conn:
        conn.put(payload, self.test_file_path)
        with conn.get(self.test_file_path) as fetched:
            contents = fetched.read()
            self.assertEqual(contents, self.test_data)
def test_is_file(self):
    """isfile() is true for an uploaded file and false for the test dir."""
    payload = StringIO.StringIO(self.test_data)
    with httmock.HTTMock(self.me), \
            get_connection(self._girder_token, self._cluster) as conn:
        conn.put(payload, self.test_file_path)

        file_result = conn.isfile(self.test_file_path)
        self.assertTrue(file_result)

        dir_result = conn.isfile(self.test_case_dir)
        self.assertFalse(dir_result)
def wrapped(event, **kwargs):
    """
    Event handler that hands cluster-backed ids over to ``func``.

    The id is taken from ``event.info['params'][key]`` or, failing that,
    from ``event.info[key]``. When it parses into a cluster id and a path,
    default handling is prevented, a connection to that cluster is opened
    with the current token and the value returned by ``func`` is added to
    the event as a response. Otherwise the event is left untouched.
    """
    info = event.info
    if 'params' in info and key in info['params']:
        encoded = info['params'][key]
    elif key in info:
        encoded = info[key]
    else:
        # Request is not well formed, delegate to core.
        return

    cluster_id = None
    try:
        cluster_id, path = _parse_id(urllib.parse.unquote_plus(encoded))
        # The id decoded successfully, so take over from the default handler.
        event.preventDefault()
    except ValueError:
        pass

    if cluster_id is None:
        return

    cluster = Cluster().load(cluster_id, user=getCurrentUser())
    token = getCurrentToken()
    with get_connection(token['_id'], cluster) as conn:
        response = func(conn, path, cluster=cluster, encoded_id=encoded)
        event.addResponse(response)
def download_path_from_cluster(cluster, girder_token, parent, path,
                               upload=False, include=None, exclude=None):
    """
    Import a path that lives on a cluster into an assetstore.

    :param cluster: The cluster to download the path from.
    :param girder_token: The Girder token to use to access Girder.
    :param parent: The target folder to import the path into.
    :param path: The path on the cluster to download.
    :param upload: Whether the import should upload the file data or only
        the metadata; defaults to False.
    :param include: List of include regexes.
    :param exclude: List of exclude regexes.
    """
    base_url = get_assetstore_url_base(cluster)
    store_id = get_assetstore_id(girder_token, cluster)
    options = {'upload': upload, 'include': include, 'exclude': exclude}

    with get_connection(girder_token, cluster) as conn:
        download_path(conn, girder_token, parent, path, base_url, store_id,
                      **options)
def _get_path(cluster, path):
    """
    Describe a path on a cluster as a folder or file model.

    The path is stat'ed over a connection opened with the current token.
    A directory yields a 'folder' model and a regular file a 'file' model;
    both carry ids generated from the cluster id and the path.

    :param cluster: The cluster document; ``_id`` is used to build the ids.
    :param path: The path on the cluster to describe.
    :returns: The model dict, or None when the path is neither a directory
        nor a regular file.
    """
    basename = os.path.basename(path)
    token = getCurrentToken()

    with get_connection(token['_id'], cluster) as conn:
        entry = conn.stat(path)
        entry_id = _generate_id(cluster['_id'], path)
        parent_id = _generate_id(cluster['_id'], os.path.dirname(path))

        model = {
            '_id': entry_id,
            'size': entry.st_size,
            'name': basename,
            'created': _mtime_isoformat(entry.st_mtime),
            'updated': _mtime_isoformat(entry.st_mtime)
        }

        if stat.S_ISDIR(entry.st_mode):
            model['_modelType'] = 'folder'
            model['description'] = ''
            model['parentCollection'] = 'folder'
            model['parentId'] = parent_id
            model['public'] = False
            return model

        if stat.S_ISREG(entry.st_mode):
            model['_modelType'] = "file"
            model['assetstoreId'] = None
            # NOTE(review): the extension keeps its leading dot here —
            # confirm that is what consumers of 'exts' expect.
            model["exts"] = [os.path.splitext(basename)[1]]
            # A stray trailing comma used to store this id as a 1-tuple.
            model['itemId'] = parent_id
            model['mimeType'] = 'application/octet-stream'
            return model

    # Neither a directory nor a regular file.
    return None
def tearDown(self):
    """Best-effort removal of the test case directory on the cluster."""
    try:
        with httmock.HTTMock(self.me), \
                get_connection(self._girder_token, self._cluster) as conn:
            cleanup_cmd = 'rm -rf %s' % self.test_case_dir
            conn.execute(cleanup_cmd)
    except Exception:
        # Cleanup failures are deliberately ignored.
        pass
def setUp(self):
    """
    Log in to NEWT, initialise the fixture attributes and create the test
    case directory on the cluster.
    """
    login_url = '%s/login' % newt_base_url
    credentials = {
        'username': NewtClusterConnectionTestCase.USER,
        'password': NewtClusterConnectionTestCase.PASSWORD
    }
    login = requests.post(login_url, data=credentials).json()
    self.assertTrue(login['auth'])
    self.session_id = login['newt_sessionid']

    self._cluster = {'type': 'newt', 'config': {'host': 'cori'}}
    self._girder_token = 'dummy'

    # Mock handler for the session id endpoint; delegates to the test case.
    def session_id(url, request):
        return self._session_id(url, request)

    url = '/api/v1/newt/sessionId'
    matcher = httmock.urlmatch(path=r'^%s$' % url, method='GET')
    self.me = matcher(session_id)

    self.scratch_dir = \
        '/global/cscratch1/sd/%s' % NewtClusterConnectionTestCase.USER
    self.test_data = 'nothing to see here!'
    self.test_case_dir = '%s/cumulus' % self.scratch_dir
    self.test_file_path = '%s/test.txt' % self.test_case_dir
    self.test_dir = '%s/cumulus' % self.test_case_dir

    # Create directory for test case
    with httmock.HTTMock(self.me), \
            get_connection(self._girder_token, self._cluster) as conn:
        conn.mkdir(self.test_case_dir)
def test_connection(cluster, log_write_url=None, girder_token=None):
    """
    Verify connectivity to a cluster and store the result as its status.

    Fetches the cluster document again with ``girder_token``, executes
    ``pwd`` on the cluster and patches the status to 'running' if any output
    came back, 'error' if not. On any exception the status is patched to
    'error' and the exception is logged without being re-raised.

    :param cluster: The cluster document; ``_id`` identifies it in Girder.
    :param log_write_url: Not used by this function.
    :param girder_token: The Girder token sent with every request.
    """
    cluster_url = '%s/clusters/%s' % (cumulus.config.girder.baseUrl,
                                      cluster['_id'])
    log = get_cluster_logger(cluster, girder_token)
    headers = {'Girder-Token': girder_token}

    def set_status(value):
        # PATCH the cluster status and hand back the response.
        return requests.patch(cluster_url, headers=headers,
                              json={'status': value})

    try:
        # Fetch the cluster with this 'admin' token first so that the
        # passphrase is filled out.
        fetched = requests.get(cluster_url, headers=headers)
        check_status(fetched)
        cluster = fetched.json()

        with get_connection(girder_token, cluster) as conn:
            status = 'running'
            # Test that we can connect to the cluster
            output = conn.execute('pwd')

        if len(output) < 1:
            log.error('Unable connect to cluster')
            status = 'error'

        check_status(set_status(status))
    except Exception as ex:
        set_status('error')
        # Log the error message
        log.exception(ex)
def create_paraview_job(task, *args, **kwargs):
    """
    Create the ParaView job in Girder and upload the visualizer script.

    The commands of the job are the lines of ``pvw.sh`` located next to this
    module. After the job has been created and stored in the taskflow
    metadata, ``pvw-visualizer.py`` is copied into the job directory on the
    cluster and ``submit_paraview_job`` is scheduled.

    :param task: The running task; gives access to the loggers and taskflow.
    :param kwargs: Must contain 'cluster', which is popped before the
        remaining arguments are forwarded to ``submit_paraview_job``.
    :raises Exception: If ``pvw.sh`` does not exist.
    """
    _update_cluster_config(task, kwargs['cluster'])
    task.logger.info('Validating args passed to flow.')
    validate_args(kwargs)

    cluster = kwargs.pop('cluster')
    # Keep the cluster on the taskflow so it is available for termination.
    task.taskflow.set_metadata('cluster', cluster)

    client = create_girder_client(task.taskflow.girder_api_url,
                                  task.taskflow.girder_token)

    task.taskflow.logger.info('Creating ParaView job.')
    task.logger.info('Load ParaView submission script.')

    module_dir = os.path.dirname(__file__)
    script_path = os.path.join(module_dir, 'pvw.sh')
    if not os.path.exists(script_path):
        msg = 'Script path %s does not exists.' % script_path
        task.logger.info(msg)
        raise Exception(msg)

    with open(script_path, 'r') as script:
        commands = script.read().splitlines()

    payload = json.dumps({
        'name': 'paraview',
        'commands': commands,
        'input': [],
        'output': []
    })
    job = client.post('jobs', data=payload)

    task.logger.info('ParaView job created: %s' % job['_id'])
    task.taskflow.logger.info('ParaView job created.')
    task.taskflow.set_metadata('jobs', [job])

    # Upload the visualizer code
    task.logger.info('Uploading visualizer')
    viz_path = os.path.abspath(os.path.join(
        os.path.dirname(__file__), '../../../../../',
        'node_modules/pvw-visualizer/server/pvw-visualizer.py'))

    if not os.path.exists(viz_path):
        task.logger.error(
            'Unable to locate pvw-visualizer.py for upload. (%s)' % viz_path)
        return

    target_dir = job_directory(cluster, job)
    target_path = os.path.join(target_dir, 'pvw-visualizer.py')

    with get_connection(task.taskflow.girder_token, cluster) as conn:
        conn.makedirs(target_dir)
        with open(viz_path, 'r') as viz:
            conn.put(viz, target_path)

    submit_paraview_job.delay(cluster, job, *args, **kwargs)
def terminate_job(cluster, job, log_write_url=None, girder_token=None):
    """
    Terminate a job and, when present, launch its 'onTerminate' commands.

    A job that has a queue job id is terminated through the queue adapter;
    otherwise its status is patched to TERMINATED. If the job defines
    'onTerminate', the rendered commands are uploaded as a script, started
    with ``nohup`` and the resulting process is handed to
    ``monitor_process``.

    :param cluster: The cluster the job runs on.
    :param job: The job document.
    :param log_write_url: Forwarded to ``monitor_process``.
    :param girder_token: The Girder token used for requests and connection.
    :raises Exception: Any failure is logged, the job is patched to
        UNEXPECTEDERROR and the exception is re-raised.
    """
    script_filepath = None
    headers = {'Girder-Token': girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)

    try:
        with get_connection(girder_token, cluster) as conn:
            if AbstractQueueAdapter.QUEUE_JOB_ID in job:
                adapter = get_queue_adapter(cluster, conn)
                output = adapter.terminate_job(job)
            else:
                r = requests.patch(status_url, headers=headers,
                                   json={'status': JobState.TERMINATED})
                check_status(r)

            if 'onTerminate' in job:
                raw = '\n'.join(job['onTerminate']['commands']) + '\n'
                rendered = Template(raw).render(
                    cluster=cluster, job=job,
                    base_url=cumulus.config.girder.baseUrl)

                on_terminate = _put_script(conn, rendered + '\n')

                terminate_output = '%s.terminate.out' % job_id
                launcher = _put_script(
                    conn,
                    'nohup %s &> %s &\n' % (on_terminate, terminate_output))
                output = conn.execute(launcher)

                # Both scripts are removed once the launcher has been run.
                conn.remove(on_terminate)
                conn.remove(launcher)

                if len(output) != 1:
                    raise Exception('PID not returned by execute command')

                try:
                    pid = int(output[0])
                except ValueError:
                    raise Exception('Unable to extract PID from: %s' % output)

                monitor_process.delay(cluster, job, pid, terminate_output,
                                      log_write_url=log_write_url,
                                      output_message='onTerminate error: %s',
                                      girder_token=girder_token)
    except Exception as ex:
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
        raise
    finally:
        if script_filepath and os.path.exists(script_filepath):
            os.remove(script_filepath)
def terminate_job(cluster, job, log_write_url=None, girder_token=None):
    """
    Terminate a job and, when present, launch its 'onTerminate' commands.

    A job that has a queue job id is terminated through the queue adapter;
    otherwise its status is patched to TERMINATED. If the job defines
    'onTerminate', the rendered commands are uploaded as a script, started
    with ``nohup`` and the resulting process is handed to
    ``monitor_process``.

    :param cluster: The cluster the job runs on.
    :param job: The job document.
    :param log_write_url: Forwarded to ``monitor_process``.
    :param girder_token: The Girder token used for requests and connection.
    :raises Exception: Any failure is logged, the job is patched to
        UNEXPECTEDERROR and the exception is re-raised.
    """
    headers = {'Girder-Token': girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)

    try:
        with get_connection(girder_token, cluster) as conn:
            if AbstractQueueAdapter.QUEUE_JOB_ID in job:
                queue_adapter = get_queue_adapter(cluster, conn)
                output = queue_adapter.terminate_job(job)
            else:
                r = requests.patch(status_url, headers=headers,
                                   json={'status': JobState.TERMINATED})
                check_status(r)

            if 'onTerminate' in job:
                commands = '\n'.join(job['onTerminate']['commands']) + '\n'
                commands = Template(commands) \
                    .render(cluster=cluster,
                            job=job,
                            base_url=cumulus.config.girder.baseUrl)

                on_terminate = _put_script(conn, commands + '\n')

                terminate_output = '%s.terminate.out' % job_id
                terminate_cmd = 'nohup %s &> %s &\n' % (on_terminate,
                                                        terminate_output)
                terminate_cmd = _put_script(conn, terminate_cmd)
                output = conn.execute(terminate_cmd)

                # Both scripts are removed once the launcher has been run.
                conn.remove(on_terminate)
                conn.remove(terminate_cmd)

                if len(output) != 1:
                    raise Exception('PID not returned by execute command')

                try:
                    pid = int(output[0])
                except ValueError:
                    raise Exception('Unable to extract PID from: %s' % output)

                output_message = 'onTerminate error: %s'
                monitor_process.delay(cluster, job, pid, terminate_output,
                                      log_write_url=log_write_url,
                                      output_message=output_message,
                                      girder_token=girder_token)
    except Exception as ex:
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        # str(ex) rather than ex.message: exceptions have no 'message'
        # attribute on Python 3, which would mask the original error.
        get_job_logger(job, girder_token).exception(str(ex))
        raise
def remove_output(task, cluster, job, girder_token):
    """
    Delete a job's directory on the cluster.

    Runs ``rm -rf`` on ``job['dir']``. An ``EOFError`` causes the task to be
    retried with a countdown of 5.

    :param task: The running task, used to schedule the retry.
    :param cluster: The cluster to connect to.
    :param job: The job document; ``dir`` is the directory to remove.
    :param girder_token: The Girder token used to open the connection.
    """
    try:
        with get_connection(girder_token, cluster) as conn:
            conn.execute('rm -rf %s' % job['dir'])
    except EOFError:
        # Try again
        task.retry(countdown=5)
def create_paraview_job(task, *args, **kwargs):
    """
    Create the ParaView job in Girder and upload the visualizer script.

    The commands of the job are the lines of ``pvw.sh`` located next to this
    module. After the job has been created and stored in the taskflow
    metadata, ``pvw-visualizer.py`` is copied into the job directory on the
    cluster and ``submit_paraview_job`` is scheduled. If the visualizer
    script cannot be found locally an error is logged and nothing is
    submitted.

    :param task: The running task; gives access to the loggers and taskflow.
    :param kwargs: Must contain 'cluster', which is popped before the
        remaining arguments are forwarded to ``submit_paraview_job``.
    :raises Exception: If ``pvw.sh`` does not exist.
    """
    _update_cluster_config(task, kwargs['cluster'])
    task.logger.info('Validating args passed to flow.')
    validate_args(kwargs)

    cluster = kwargs.pop('cluster')

    # Save the cluster in the taskflow for termination
    task.taskflow.set_metadata('cluster', cluster)

    client = create_girder_client(
        task.taskflow.girder_api_url, task.taskflow.girder_token)

    task.taskflow.logger.info('Creating ParaView job.')
    task.logger.info('Load ParaView submission script.')

    base_path = os.path.dirname(__file__)
    script_path = os.path.join(base_path, 'pvw.sh')

    if not os.path.exists(script_path):
        msg = 'Script path %s does not exists.' % script_path
        task.logger.info(msg)
        raise Exception(msg)

    with open(script_path, 'r') as fp:
        commands = fp.read().splitlines()

    body = {
        'name': 'paraview',
        'commands': commands,
        'input': [],
        'output': []
    }

    job = client.post('jobs', data=json.dumps(body))
    task.logger.info('ParaView job created: %s' % job['_id'])
    task.taskflow.logger.info('ParaView job created.')

    task.taskflow.set_metadata('jobs', [job])

    # Upload the visualizer code
    task.logger.info('Uploading visualizer')
    viz_path = os.path.abspath(
        os.path.join(os.path.dirname(__file__), '../', '../', '../', '../',
                     'node_modules/pvw-visualizer/server/pvw-visualizer.py'))

    if not os.path.exists(viz_path):
        # Message fixed ("local" -> "locate") and the path that was looked
        # up is included so the failure can be diagnosed from the log.
        task.logger.error(
            'Unable to locate pvw-visualizer.py for upload. (%s)' % viz_path)
        return

    target_dir = job_directory(cluster, job)
    target_path = os.path.join(target_dir, 'pvw-visualizer.py')

    with get_connection(task.taskflow.girder_token, cluster) as conn:
        conn.makedirs(target_dir)
        with open(viz_path, 'r') as fp:
            conn.put(fp, target_path)

    submit_paraview_job.delay(cluster, job, *args, **kwargs)
def test_remove(self):
    """After remove(), stat() on the uploaded file raises NewtException."""
    payload = StringIO.StringIO(self.test_data)
    with httmock.HTTMock(self.me), \
            get_connection(self._girder_token, self._cluster) as conn:
        conn.put(payload, self.test_file_path)
        self.assertTrue(conn.isfile(self.test_file_path))

        conn.remove(self.test_file_path)

        with self.assertRaises(NewtException):
            conn.stat(self.test_file_path)
def test_list(self):
    """Every listed entry has exactly the six expected fields."""
    expected_fields = ('name', 'group', 'user', 'mode', 'date', 'size')
    with get_connection(self._girder_token, self._cluster) as conn:
        for entry in conn.list(self._test_case_dir):
            self.assertEqual(len(entry.keys()), 6)
            for field in expected_fields:
                self.assertTrue(field in entry)
def create_openfoam_job(task, *args, **kwargs):
    """
    Create the OpenFOAM job in Girder and upload the simput unpack script.

    The job is registered with Girder and attached to the taskflow, its
    working directory is recorded as 'dataDir', ``simput-unpack.py`` is
    copied into that directory on the cluster and ``submit_open_foam_job``
    is scheduled.

    :param task: The running task; gives access to the loggers and taskflow.
    :param kwargs: Must contain 'cluster' (popped here) and 'input' with the
        'folder' and 'project' folder ids.
    """
    taskflow = task.taskflow

    # Girder client
    client = create_girder_client(taskflow.girder_api_url,
                                  taskflow.girder_token)

    # Keep the cluster on the taskflow so it is available for termination.
    cluster = kwargs.pop('cluster')
    taskflow.set_metadata('cluster', cluster)

    # Create job definition
    taskflow.logger.info('Creating OpenFoam job.')
    commands = [
        'python $PWD/simput-unpack.py $PWD/input-deck.json $PWD',
        'docker start of_v1612_plus',
        'docker exec -t of_v1612_plus $PWD/DockerRun $PWD'
    ]
    inputs = [
        {'folderId': kwargs['input']['folder']['id'], 'path': '.'},
        {'folderId': kwargs['input']['project']['folder']['id'],
         'path': '.'}
    ]
    body = {
        'name': 'openfoam_run',
        'commands': commands,
        'input': inputs,
        'output': []
    }

    # Register job in girder + attach to taskflow
    job = client.post('jobs', data=json.dumps(body))
    task.logger.info('OpenFOAM job created: %s' % job['_id'])
    taskflow.logger.info('OpenFOAM job created.')
    taskflow.set_metadata('jobs', [job])

    # Capture job working directory
    target_dir = job_directory(cluster, job)
    taskflow.set_metadata('dataDir', target_dir)

    source_path = os.path.abspath(os.path.join(
        os.path.dirname(__file__), '../../../../../',
        'node_modules/simput/bin/unpack/simput-unpack.py'))
    target_path = os.path.join(target_dir, 'simput-unpack.py')

    # Upload unpack script
    with get_connection(taskflow.girder_token, cluster) as conn:
        conn.makedirs(target_dir)
        with open(source_path, 'r') as unpack_script:
            conn.put(unpack_script, target_path)

    # Move to the next task
    submit_open_foam_job.delay(cluster, job, *args, **kwargs)
def download_job_input_items(cluster, job, log_write_url=None,
                             girder_token=None):
    """
    Start the download of a job's input items on the cluster.

    The girder client module is copied to the cluster, the job is patched to
    'downloading' and the client is started in the background with
    ``nohup``. The resulting process is handed to ``monitor_process`` with
    ``submit_job`` as the completion callback. Any exception patches the job
    to 'error' and is logged; it is not re-raised.

    :param cluster: The cluster to download to.
    :param job: The job document.
    :param log_write_url: Forwarded to the follow-up tasks.
    :param girder_token: The Girder token used for requests, the connection
        and the remote client.
    """
    job_id = job['_id']
    headers = {'Girder-Token': girder_token}
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)

    try:
        with get_connection(girder_token, cluster) as conn:
            # First put girder client on master
            client_source = inspect.getsourcefile(cumulus.girderclient)
            with open(client_source, 'r') as source:
                conn.put(source, os.path.basename(client_source))

            r = requests.patch(status_url, json={'status': 'downloading'},
                               headers=headers)
            check_status(r)

            client_cmd = 'python girderclient.py --token %s --url "%s" ' \
                         'download --dir %s --job %s' \
                % (girder_token, cumulus.config.girder.baseUrl,
                   job_directory(cluster, job), job_id)

            download_output = '%s.download.out' % job_id
            launcher = _put_script(
                conn, 'nohup %s &> %s &\n' % (client_cmd, download_output))
            output = conn.execute(launcher)

            # Remove download script
            conn.remove(launcher)

            if len(output) != 1:
                raise Exception('PID not returned by execute command')

            try:
                pid = int(output[0])
            except ValueError:
                raise Exception('Unable to extract PID from: %s' % output)

            # When the download is complete submit the job
            on_complete = submit_job.s(cluster, job,
                                       log_write_url=log_write_url,
                                       girder_token=girder_token)

            monitor_process.delay(cluster, job, pid, download_output,
                                  log_write_url=log_write_url,
                                  on_complete=on_complete,
                                  girder_token=girder_token)
    except Exception as ex:
        r = requests.patch(status_url, headers=headers,
                           json={'status': 'error'})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
def download_job_input_items(cluster, job, log_write_url=None,
                             girder_token=None):
    """
    Start the download of a job's input items on the cluster.

    The girder client module is copied to the cluster, the job is patched to
    'downloading' and the client is started in the background with
    ``nohup``. The resulting process is handed to ``monitor_process`` with
    ``submit_job`` as the completion callback. Any exception patches the job
    to 'error' and is logged; it is not re-raised.

    :param cluster: The cluster to download to.
    :param job: The job document.
    :param log_write_url: Forwarded to the follow-up tasks.
    :param girder_token: The Girder token used for requests, the connection
        and the remote client.
    """
    headers = {'Girder-Token': girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)

    try:
        with get_connection(girder_token, cluster) as conn:
            # First put girder client on master
            path = inspect.getsourcefile(cumulus.girderclient)
            with open(path, 'r') as fp:
                conn.put(fp, os.path.basename(path))

            r = requests.patch(status_url, json={'status': 'downloading'},
                               headers=headers)
            check_status(r)

            download_cmd = 'python girderclient.py --token %s --url "%s" ' \
                           'download --dir %s --job %s' \
                % (girder_token, cumulus.config.girder.baseUrl,
                   job_directory(cluster, job), job_id)

            download_output = '%s.download.out' % job_id
            download_cmd = 'nohup %s &> %s &\n' % (download_cmd,
                                                   download_output)
            download_cmd = _put_script(conn, download_cmd)
            output = conn.execute(download_cmd)

            # Remove download script
            conn.remove(download_cmd)

            if len(output) != 1:
                raise Exception('PID not returned by execute command')

            try:
                pid = int(output[0])
            except ValueError:
                raise Exception('Unable to extract PID from: %s' % output)

            # When the download is complete submit the job
            on_complete = submit_job.s(cluster, job,
                                       log_write_url=log_write_url,
                                       girder_token=girder_token)

            monitor_process.delay(cluster, job, pid, download_output,
                                  log_write_url=log_write_url,
                                  on_complete=on_complete,
                                  girder_token=girder_token)
    except Exception as ex:
        r = requests.patch(status_url, headers=headers,
                           json={'status': 'error'})
        check_status(r)
        # str(ex) rather than ex.message: exceptions have no 'message'
        # attribute on Python 3, which would mask the original error.
        get_job_logger(job, girder_token).exception(str(ex))
def upload_file(cluster, girder_token, file, path):
    """
    Copy a Girder file onto a cluster.

    The directory part of ``path`` is created on the cluster before the
    file is transferred.

    :param cluster: The cluster to upload to.
    :param girder_token: The Girder token for Girder access.
    :param file: The Girder file object.
    :param path: The path on the cluster to upload to.
    """
    client = GirderClient(apiUrl=cumulus.config.girder.baseUrl)
    client.token = girder_token

    with get_connection(girder_token, cluster) as conn:
        parent_dir = os.path.dirname(path)
        conn.makedirs(parent_dir)
        _upload_file(conn, client, file, path)
def download_job_input_folders(cluster, job, log_write_url=None,
                               girder_token=None, submit=True):
    """
    Copy a job's input folders into its directory on the cluster.

    Every entry of ``job['input']`` that has both 'folderId' and 'path' is
    passed to ``upload_path`` with the path joined onto the job directory.
    Entries missing either key are skipped.

    :param cluster: The cluster to copy to.
    :param job: The job document.
    :param log_write_url: Forwarded to ``submit_job``.
    :param girder_token: The Girder token used for the connection and copy.
    :param submit: When true, ``submit_job`` is scheduled afterwards.
    """
    job_dir = job_directory(cluster, job)

    with get_connection(girder_token, cluster) as conn:
        for spec in job['input']:
            if 'folderId' not in spec or 'path' not in spec:
                continue
            folder_id = spec['folderId']
            destination = os.path.join(job_dir, spec['path'])
            upload_path(conn, girder_token, folder_id, destination)

    if submit:
        submit_job.delay(cluster, job, log_write_url=log_write_url,
                         girder_token=girder_token)
def download_job_input(cluster, job, log_write_url=None, girder_token=None):
    """
    Create the job directory on the cluster and start the input download.

    When the job has inputs with an 'itemId' the item based download is
    used, otherwise the folder based one.

    :param cluster: The cluster to download to.
    :param job: The job document.
    :param log_write_url: Forwarded to the download function.
    :param girder_token: The Girder token used for logging, the connection
        and the download.
    """
    job_url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl, job['_id'])
    log = get_post_logger(job['_id'], girder_token, job_url)

    # Create job directory. makedirs is what the other call sites in this
    # code base use for job directories; NOTE(review): presumably it also
    # creates missing parents, unlike mkdir — confirm on the connection
    # classes.
    with get_connection(girder_token, cluster) as conn:
        conn.makedirs(job_directory(cluster, job))

    log.info('Downloading input for "%s"' % job['name'])

    if parse('input.itemId').find(job):
        download_job_input_items(cluster, job, log_write_url=log_write_url,
                                 girder_token=girder_token)
    else:
        download_job_input_folders(cluster, job, log_write_url=log_write_url,
                                   girder_token=girder_token)
def test_get_ssh_connection(self, connect, from_private_key_file):
    """
    get_connection() yields an SshClusterConnection for a 'trad' cluster.

    ``connect`` and ``from_private_key_file`` are not used in the body;
    presumably they are injected by patch decorators — confirm at the
    definition site.
    """
    ssh_config = {
        'user': '******',
        'key': self._cluster_id,
        'passphrase': 'test'
    }
    cluster = {
        '_id': self._cluster_id,
        'config': {'ssh': ssh_config, 'host': 'localhost'},
        'type': 'trad'
    }

    with get_connection('girder_token', cluster) as ssh:
        self.assertIsInstance(ssh, SshClusterConnection)
def download_job_input(cluster, job, log_write_url=None, girder_token=None):
    """
    Create the job directory on the cluster and start the input download.

    When the job has inputs with an 'itemId' the item based download is
    used, otherwise the folder based one.

    :param cluster: The cluster to download to.
    :param job: The job document.
    :param log_write_url: Forwarded to the download function.
    :param girder_token: The Girder token used for logging, the connection
        and the download.
    """
    job_id = job['_id']
    log_url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl, job_id)
    log = get_post_logger(job['_id'], girder_token, log_url)

    # Create job directory
    with get_connection(girder_token, cluster) as conn:
        conn.makedirs(job_directory(cluster, job))

    log.info('Downloading input for "%s"' % job['name'])

    has_items = parse('input.itemId').find(job)
    if has_items:
        downloader = download_job_input_items
    else:
        downloader = download_job_input_folders

    downloader(cluster, job, log_write_url=log_write_url,
               girder_token=girder_token)
def upload_job_output_to_folder(cluster, job, log_write_url=None,
                                job_dir=None, girder_token=None):
    """
    Import a job's output paths from the cluster into Girder folders.

    Every entry of ``job['output']`` that has both 'folderId' and 'path' is
    passed to ``download_path``. An ``HttpError`` marks the job as ERROR
    (locally and in Girder) and is logged. Afterwards the cluster is
    terminated if the job requests it, and a job that is still UPLOADING is
    moved to its completed state.

    :param cluster: The cluster holding the output.
    :param job: The job document; its 'status' may be modified.
    :param log_write_url: Forwarded to the job state object.
    :param job_dir: The job directory; defaults to ``job['dir']``.
    :param girder_token: The Girder token used for requests and connection.
    """
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job['_id'])
    headers = {'Girder-Token': girder_token}
    assetstore_base_url = get_assetstore_url_base(cluster)
    assetstore_id = get_assetstore_id(girder_token, cluster)
    if not job_dir:
        job_dir = job['dir']

    try:
        with get_connection(girder_token, cluster) as conn:
            for output in job['output']:
                # Entries without both keys are skipped.
                if 'folderId' in output and 'path' in output:
                    folder_id = output['folderId']
                    path = os.path.join(job_dir, output['path'])
                    download_path(conn, girder_token, folder_id, path,
                                  assetstore_base_url, assetstore_id)
    except HttpError as e:
        # Setting the local status keeps the UPLOADING branch below from
        # running after a failed upload.
        job['status'] = JobState.ERROR
        url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl, job['_id'])
        logger = get_post_logger('job', girder_token, url)
        logger.exception(e.responseText)
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.ERROR})
        check_status(r)

    if _get_on_complete(job) == 'terminate':
        cluster_log_url = '%s/clusters/%s/log' % \
            (cumulus.config.girder.baseUrl, cluster['_id'])
        command.send_task(
            'cumulus.tasks.cluster.terminate_cluster',
            args=(cluster,), kwargs={'log_write_url': cluster_log_url,
                                     'girder_token': girder_token})

    # If we were uploading move job to the complete state
    # NOTE(review): 'conn' is passed on here after its 'with' block appears
    # to have exited — confirm the nesting and whether the state object
    # needs a live connection.
    if job['status'] == JobState.UPLOADING:
        job_status = from_string(job['status'], task=None,
                                 cluster=cluster, job=job,
                                 log_write_url=log_write_url,
                                 girder_token=girder_token,
                                 conn=conn)
        job_status = Complete(job_status)
        job_status = job_status.next(JobQueueState.COMPLETE)
        job_status.run()
        r = requests.patch(status_url, headers=headers,
                           json={'status': str(job_status)})
        check_status(r)
def setUp(self):
    """
    Authenticate against NEWT, populate the fixture attributes and create
    the test case directory on the cluster.
    """
    user = NewtClusterConnectionTestCase.USER
    data = {
        'username': user,
        'password': NewtClusterConnectionTestCase.PASSWORD
    }
    r = requests.post('%s/login' % newt_base_url, data=data)
    json_resp = r.json()
    self.assertTrue(json_resp['auth'])
    self.session_id = json_resp['newt_sessionid']

    self._cluster = {'type': 'newt', 'config': {'host': 'cori'}}
    self._girder_token = 'dummy'

    # Mock handler for the session id endpoint; delegates to the test case.
    def session_id(url, request):
        return self._session_id(url, request)

    url = '/api/v1/newt/sessionId'
    self.me = httmock.urlmatch(path=r'^%s$' % url, method='GET')(session_id)

    self.scratch_dir = '/global/cscratch1/sd/%s' % user
    self.test_data = 'nothing to see here!'
    self.test_case_dir = '%s/cumulus' % self.scratch_dir
    self.test_file_path = '%s/test.txt' % self.test_case_dir
    self.test_dir = '%s/cumulus' % self.test_case_dir

    # Create directory for test case
    with httmock.HTTMock(self.me), \
            get_connection(self._girder_token, self._cluster) as conn:
        conn.mkdir(self.test_case_dir)
def test_execute(self):
    """execute('ls /bin/ls') is expected to return '/bin/ls'."""
    with httmock.HTTMock(self.me), \
            get_connection(self._girder_token, self._cluster) as conn:
        result = conn.execute('ls /bin/ls')
        self.assertEqual(result, '/bin/ls')
def setUp(self):
    """Create the test case directory and put an empty test.txt into it."""
    with get_connection(self._girder_token, self._cluster) as conn:
        # Create directory for test case
        conn.mkdir(self._test_case_dir)
        empty_file = StringIO.StringIO()
        conn.put(empty_file, '%s/test.txt' % self._test_case_dir)
def test_mkdir(self):
    """A directory created with mkdir() is not reported as a file."""
    with httmock.HTTMock(self.me), \
            get_connection(self._girder_token, self._cluster) as conn:
        conn.mkdir(self.test_dir)
        reported_as_file = conn.isfile(self.test_dir)
        self.assertFalse(reported_as_file)
def test_stat(self):
    """stat() on the test case directory completes without raising."""
    with httmock.HTTMock(self.me), \
            get_connection(self._girder_token, self._cluster) as conn:
        conn.stat(self.test_case_dir)
def tearDown(self):
    """Remove the test case directory; any failure propagates."""
    with get_connection(self._girder_token, self._cluster) as conn:
        cleanup_cmd = 'rm -rf %s' % self._test_case_dir
        conn.execute(cleanup_cmd)
def test_perms_to_mode(self):
    """'drwxr-xr-x' must map to mode 16877 (0o40755)."""
    perms = 'drwxr-xr-x'
    with httmock.HTTMock(self.me), \
            get_connection(self._girder_token, self._cluster) as conn:
        mode = conn._perms_to_mode(perms)
        self.assertEqual(mode, 16877)
def upload_job_output_to_item(cluster, job, log_write_url=None,
                              job_dir=None, girder_token=None):
    """
    Start the upload of a job's output from the cluster into Girder items.

    The girder client module is copied next to the job directory and started
    in the background from within the job directory. The resulting process
    is handed to ``monitor_process``; if the job requests termination on
    completion, the cluster termination task is passed as the completion
    callback. Nothing is done when the job is terminating. Any exception
    patches the job to UNEXPECTEDERROR and is logged; it is not re-raised.

    :param cluster: The cluster holding the output.
    :param job: The job document.
    :param log_write_url: Forwarded to ``monitor_process``.
    :param job_dir: The job directory; defaults to ``job['dir']``.
    :param girder_token: The Girder token used for requests, the connection
        and the remote client.
    """
    headers = {'Girder-Token': girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)

    try:
        # if terminating break out
        if _is_terminating(job, girder_token):
            return

        # Fall back to the directory recorded on the job, as
        # upload_job_output_to_folder does; joining onto None would raise.
        if not job_dir:
            job_dir = job['dir']

        with get_connection(girder_token, cluster) as conn:
            # First put girder client on master
            path = inspect.getsourcefile(cumulus.girderclient)
            with open(path, 'r') as fp:
                conn.put(fp, os.path.normpath(
                    os.path.join(job_dir, '..', os.path.basename(path))))

            cmds = ['cd %s' % job_dir]
            upload_cmd = 'python ../girderclient.py --token %s --url "%s" ' \
                         'upload --job %s' \
                % (girder_token, cumulus.config.girder.baseUrl, job['_id'])

            # The output file lives in the parent of the job directory.
            upload_output = '%s.upload.out' % job_id
            upload_output_path = os.path.normpath(
                os.path.join(job_dir, '..', upload_output))
            cmds.append('nohup %s &> ../%s &\n' % (upload_cmd, upload_output))

            upload_cmd = _put_script(conn, '\n'.join(cmds))
            output = conn.execute(upload_cmd)

            # Remove upload script
            conn.remove(upload_cmd)

            if len(output) != 1:
                raise Exception('PID not returned by execute command')

            try:
                pid = int(output[0])
            except ValueError:
                raise Exception('Unable to extract PID from: %s' % output)

            on_complete = None
            if _get_on_complete(job) == 'terminate':
                cluster_log_url = '%s/clusters/%s/log' % \
                    (cumulus.config.girder.baseUrl, cluster['_id'])
                on_complete = signature(
                    'cumulus.tasks.cluster.terminate_cluster',
                    args=(cluster, ),
                    kwargs={
                        'log_write_url': cluster_log_url,
                        'girder_token': girder_token
                    })

            monitor_process.delay(cluster, job, pid, upload_output_path,
                                  log_write_url=log_write_url,
                                  on_complete=on_complete,
                                  girder_token=girder_token)
    except Exception as ex:
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
def submit_job(cluster, job, log_write_url=None, girder_token=None,
               monitor=True):
    """
    Generate a job's submission script, upload it and queue the job.

    The number of slots is taken from the job parameters, then from the
    cluster config and finally from the parallel environment. After the job
    has been submitted through the queue adapter its Girder document is
    patched with the QUEUED status, the queue job id and the job directory.
    Nothing is done when the job is terminating.

    :param cluster: The cluster to submit to.
    :param job: The job document; 'dir' and the queue job id are set on it.
    :param log_write_url: Forwarded to ``monitor_job``.
    :param girder_token: The Girder token used for requests and connection.
    :param monitor: When true, ``monitor_job`` is scheduled with a countdown
        of 5.
    :raises Exception: Any failure patches the job to UNEXPECTEDERROR, is
        logged and re-raised.
    """
    job_url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl, job['_id'])
    log = get_post_logger(job['_id'], girder_token, job_url)
    headers = {'Girder-Token': girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)
    try:
        # if terminating break out
        if _is_terminating(job, girder_token):
            return

        # The submission script is stored under the job's name.
        script_name = job['name']

        with get_connection(girder_token, cluster) as conn:
            job_params = {}
            if 'params' in job:
                job_params = job['params']

            # The output of 'pwd' is used as the user's home directory.
            output = conn.execute('pwd')
            if len(output) != 1:
                raise Exception('Unable to fetch users home directory.')

            user_home = output[0].strip()
            job_dir = job_directory(cluster, job, user_home=user_home)
            job['dir'] = job_dir

            # -1 means "number of slots not known yet".
            slots = -1

            # Try job parameters first
            slots = int(job_params.get('numberOfSlots', slots))

            if slots == -1:
                # Try the cluster
                slots = int(cluster['config'].get('numberOfSlots', slots))

            parallel_env = _get_parallel_env(cluster, job)
            if parallel_env:
                job_params['parallelEnvironment'] = parallel_env

                # If the number of slots has not been provided we will get
                # the number of slots from the parallel environment
                # NOTE(review): the nesting of the two checks below was
                # reconstructed from a flattened listing — confirm.
                if slots == -1:
                    slots = int(
                        get_queue_adapter(cluster, conn).number_of_slots(parallel_env))
                    if slots > 0:
                        job_params['numberOfSlots'] = slots

            script = _generate_submission_script(job, cluster, job_params)

            conn.makedirs(job_dir)
            # put the script to master
            conn.put(StringIO(script), os.path.join(job_dir, script_name))

            if slots > -1:
                log.info('We have %s slots available' % slots)

            # Now submit the job
            queue_job_id \
                = get_queue_adapter(cluster, conn).submit_job(job, script_name)

            # Update the state and queue job id
            job[AbstractQueueAdapter.QUEUE_JOB_ID] = queue_job_id
            patch_data = {
                'status': JobState.QUEUED,
                AbstractQueueAdapter.QUEUE_JOB_ID: queue_job_id,
                'dir': job_dir
            }

            r = requests.patch(status_url, headers=headers, json=patch_data)
            check_status(r)
            # From here on 'job' is the document returned by Girder.
            job = r.json()
            job['queuedTime'] = time.time()

            # Now monitor the jobs progress
            if monitor:
                monitor_job.s(
                    cluster, job, log_write_url=log_write_url,
                    girder_token=girder_token).apply_async(countdown=5)

        # Now update the status of the job
        # NOTE(review): this repeats the QUEUED status already sent in
        # patch_data above — confirm whether it is still needed.
        headers = {'Girder-Token': girder_token}
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.QUEUED})
        check_status(r)
    except Exception as ex:
        traceback.print_exc()
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
        raise
def monitor_process(task, cluster, job, pid, nohup_out_path,
                    log_write_url=None, on_complete=None,
                    output_message='Job download/upload error: %s',
                    girder_token=None):
    """
    Poll a background process on the cluster and react once it has ended.

    While ``ps`` still lists the pid the task reschedules itself. Once the
    process is gone its nohup output is read: non-empty output is logged as
    an error and the job is patched to ERROR. Otherwise the ``on_complete``
    task is fired, if given, and a job that is UPLOADING is moved to its
    completed state. Nothing is done when the job is terminating.

    :param task: The running task, used to schedule retries.
    :param cluster: The cluster the process runs on.
    :param job: The job document.
    :param pid: The id of the process to watch.
    :param nohup_out_path: Path on the cluster of the process output file.
    :param log_write_url: Forwarded to the job state object.
    :param on_complete: Optional task signature to run on success.
    :param output_message: Format string used to log process output.
    :param girder_token: The Girder token used for requests and connection.
    :raises Exception: Failures other than ``EOFError`` patch the job to
        UNEXPECTEDERROR, are logged and re-raised.
    """
    job_url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl, job['_id'])
    log = get_post_logger(job['_id'], girder_token, job_url)
    headers = {'Girder-Token': girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)

    try:
        # if terminating break out
        if _is_terminating(job, girder_token):
            return

        with get_connection(girder_token, cluster) as conn:
            # See if the process is still running
            output = conn.execute('ps %s | grep %s' % (pid, pid),
                                  ignore_exit_status=True,
                                  source_profile=False)

            if len(output) > 0:
                # Process is still running so schedule self again in about 5
                # secs
                # N.B. throw=False to prevent Retry exception being raised
                task.retry(throw=False, countdown=5)
            else:
                try:
                    nohup_out_file_name = os.path.basename(nohup_out_path)

                    # Log the output
                    with conn.get(nohup_out_path) as fp:
                        output = fp.read()
                        if output.strip():
                            log.error(output_message % output)
                            # If we have output then set the error state on
                            # the job and return
                            r = requests.patch(
                                status_url, headers=headers,
                                json={'status': JobState.ERROR})
                            check_status(r)
                            return
                finally:
                    # Removes a local file of the same name, if one exists.
                    if nohup_out_file_name and \
                            os.path.exists(nohup_out_file_name):
                        os.remove(nohup_out_file_name)

                # Fire off the on_complete task if we have one
                if on_complete:
                    signature(on_complete).delay()

                # If we were uploading move job to the complete state
                if job['status'] == JobState.UPLOADING:
                    job_status = from_string(job['status'], task=task,
                                             cluster=cluster, job=job,
                                             log_write_url=log_write_url,
                                             girder_token=girder_token,
                                             conn=conn)
                    job_status = Complete(job_status)
                    job_status = job_status.next(JobQueueState.COMPLETE)
                    job_status.run()
                    r = requests.patch(status_url, headers=headers,
                                       json={'status': str(job_status)})
                    check_status(r)
    except EOFError:
        # Try again
        task.retry(throw=False, countdown=5)
    except Exception as ex:
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        # str(ex) rather than ex.message: exceptions have no 'message'
        # attribute on Python 3, which would mask the original error.
        get_job_logger(job, girder_token).exception(str(ex))
        raise
def create_paraview_job(task, *args, **kwargs):
    """
    Create the ParaView job in Girder and upload the visualization files.

    The commands of the job are the lines of ``pvw.sh`` located next to this
    module. Every file named in ``LOCAL_FILES`` is copied from
    ``LOCAL_DIRECTORY`` into the job directory on the cluster, after the
    directories in ``DESTINATION_DIRECTORIES`` have been created there, and
    ``submit_paraview_job`` is scheduled. If any local file is missing an
    error is logged and nothing is uploaded or submitted.

    :param task: The running task; gives access to the loggers and taskflow.
    :param kwargs: Must contain 'cluster', which is popped before the
        remaining arguments are forwarded to ``submit_paraview_job``.
    :raises Exception: If ``pvw.sh`` does not exist.
    """
    _update_cluster_config(task, kwargs['cluster'])
    task.logger.info('Validating args passed to flow.')
    validate_args(kwargs)

    cluster = kwargs.pop('cluster')
    # Keep the cluster on the taskflow so it is available for termination.
    task.taskflow.set_metadata('cluster', cluster)

    client = create_girder_client(task.taskflow.girder_api_url,
                                  task.taskflow.girder_token)

    task.taskflow.logger.info('Creating ParaView job.')
    task.logger.info('Load ParaView submission script.')

    script_path = os.path.join(os.path.dirname(__file__), 'pvw.sh')
    if not os.path.exists(script_path):
        msg = 'Script path %s does not exists.' % script_path
        task.logger.info(msg)
        raise Exception(msg)

    with open(script_path, 'r') as script:
        commands = script.read().splitlines()

    payload = json.dumps({
        'name': 'paraview',
        'commands': commands,
        'input': [],
        'output': []
    })
    job = client.post('jobs', data=payload)

    task.logger.info('ParaView job created: %s' % job['_id'])
    task.taskflow.logger.info('ParaView job created.')
    task.taskflow.set_metadata('jobs', [job])

    # Upload the visualizer code
    task.logger.info('Uploading visualization application')
    target_dir = job_directory(cluster, job)

    # Collect (source, destination) pairs; give up on the first missing file.
    transfers = []
    for name in LOCAL_FILES:
        source = os.path.abspath(os.path.join(LOCAL_DIRECTORY, name))
        destination = os.path.join(target_dir, name)
        if not os.path.exists(source):
            task.logger.error(
                'Unable to locate file for upload. (%s)' % source)
            return
        transfers.append((source, destination))

    # Copy the files to the server
    with get_connection(task.taskflow.girder_token, cluster) as conn:
        conn.makedirs(target_dir)
        for sub_dir in DESTINATION_DIRECTORIES:
            conn.makedirs(os.path.join(target_dir, sub_dir))

        for source, destination in transfers:
            with open(source, 'r') as local_file:
                conn.put(local_file, destination)

    submit_paraview_job.delay(cluster, job, *args, **kwargs)
def _monitor_jobs(task, cluster, jobs, log_write_url=None, girder_token=None,
                  monitor_interval=5):
    """
    Advance the state of each job from the queue's report and retry the task
    while any job is still active.

    For every ``(job, state)`` pair returned by the queue adapter the current
    status is fetched from Girder, moved to its next state, run, and the
    resulting status/timings/output are patched back onto the job. Jobs whose
    stored status is ``JobState.TERMINATED`` are skipped.

    :param task: The bound task, used for ``retry``.
    :param cluster: The cluster document the jobs run on.
    :param jobs: The jobs to monitor.
    :param log_write_url: Forwarded to the job state objects.
    :param girder_token: Token used for Girder requests and the connection.
    :param monitor_interval: Seconds to wait before the next poll.
    """
    base_url = cumulus.config.girder.baseUrl
    auth_headers = {'Girder-Token': girder_token}
    cluster_url = '%s/clusters/%s' % (base_url, cluster['_id'])

    try:
        with get_connection(girder_token, cluster) as conn:
            try:
                adapter = get_queue_adapter(cluster, conn)
                seen_states = set()

                for job, queue_state in adapter.job_statuses(jobs):
                    # First get the current status
                    status_url = '%s/jobs/%s/status' % (base_url, job['_id'])
                    response = requests.get(status_url, headers=auth_headers)
                    check_status(response)
                    stored_status = response.json()['status']

                    if stored_status == JobState.TERMINATED:
                        continue

                    state_obj = from_string(stored_status, task=task,
                                            cluster=cluster, job=job,
                                            log_write_url=log_write_url,
                                            girder_token=girder_token,
                                            conn=conn)
                    state_obj = state_obj.next(queue_state)
                    # The job dict is updated before run() is invoked
                    job['status'] = str(state_obj)
                    state_obj.run()

                    payload = {
                        'status': str(state_obj),
                        'timings': job.get('timings', {}),
                        'output': job['output']
                    }
                    job_url = '%s/jobs/%s' % (base_url, job['_id'])
                    response = requests.patch(job_url, headers=auth_headers,
                                              json=payload)
                    check_status(response)

                    seen_states.add(job['status'])

                # Now see if we still have jobs to monitor
                active_states = {
                    JobState.CREATED,
                    JobState.QUEUED,
                    JobState.RUNNING,
                    JobState.TERMINATING,
                }

                # Do we have any job still in a running state?
                if not seen_states.isdisjoint(active_states):
                    task.retry(countdown=monitor_interval)
            except EOFError:
                # Try again
                task.retry(countdown=5)
                return
            except paramiko.ssh_exception.NoValidConnectionsError:
                # Try again
                task.retry(countdown=5)
                return
    # Ensure that the Retry exception will get through
    except Retry:
        raise
    except paramiko.ssh_exception.NoValidConnectionsError as ex:
        response = requests.patch(cluster_url, headers=auth_headers,
                                  json={'status': 'error'})
        check_status(response)
        get_cluster_logger(cluster, girder_token).exception(str(ex))
    except Exception as ex:
        traceback.print_exc()
        response = requests.patch(cluster_url, headers=auth_headers,
                                  json={'status': 'error'})
        check_status(response)
        get_cluster_logger(cluster, girder_token).exception(str(ex))
        raise
def submit_job(cluster, job, log_write_url=None, girder_token=None,
               monitor=True):
    """
    Generate the submission script for ``job``, upload it to the cluster and
    hand it to the queue.

    The job is skipped if it is already terminating. On success the job is
    patched to ``JobState.QUEUED`` together with its queue job id and
    directory, and (optionally) ``monitor_job`` is scheduled.

    :param cluster: The cluster document to submit to.
    :param job: The job document; ``name`` is used as the script file name.
    :param log_write_url: Forwarded to ``monitor_job``.
    :param girder_token: Token used for Girder requests and the connection.
    :param monitor: When true, schedule ``monitor_job`` after submission.
    :raises Exception: Any failure is re-raised after the job has been
                       patched to ``JobState.UNEXPECTEDERROR`` and logged.
    """
    job_url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl, job['_id'])
    log = get_post_logger(job['_id'], girder_token, job_url)
    headers = {'Girder-Token': girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)

    try:
        # if terminating break out
        if _is_terminating(job, girder_token):
            return

        script_name = job['name']

        with get_connection(girder_token, cluster) as conn:
            # N.B. this is the job's own dict when present, so additions
            # below are visible through job['params'] as well.
            job_params = job.get('params', {})

            output = conn.execute('pwd')
            if len(output) != 1:
                raise Exception('Unable to fetch users home directory.')

            user_home = output[0].strip()
            job_dir = job_directory(cluster, job, user_home=user_home)
            job['dir'] = job_dir

            # Try job parameters first, -1 means "not provided"
            slots = int(job_params.get('numberOfSlots', -1))

            if slots == -1:
                # Try the cluster
                slots = int(cluster['config'].get('numberOfSlots', slots))

            parallel_env = _get_parallel_env(cluster, job)
            if parallel_env:
                job_params['parallelEnvironment'] = parallel_env

                # If the number of slots has not been provided we will get
                # the number of slots from the parallel environment
                if slots == -1:
                    slots = int(get_queue_adapter(cluster, conn)
                                .number_of_slots(parallel_env))
                    if slots > 0:
                        job_params['numberOfSlots'] = slots

            script = _generate_submission_script(job, cluster, job_params)

            conn.mkdir(job_dir, ignore_failure=True)
            # put the script to master
            conn.put(StringIO(script), os.path.join(job_dir, script_name))

            if slots > -1:
                log.info('We have %s slots available' % slots)

            # Now submit the job
            queue_job_id = get_queue_adapter(cluster, conn).submit_job(
                job, script_name)

            # Update the state and queue job id
            job[AbstractQueueAdapter.QUEUE_JOB_ID] = queue_job_id
            patch_data = {
                'status': JobState.QUEUED,
                AbstractQueueAdapter.QUEUE_JOB_ID: queue_job_id,
                'dir': job_dir
            }

            r = requests.patch(status_url, headers=headers, json=patch_data)
            check_status(r)
            job = r.json()
            job['queuedTime'] = time.time()

            # Now monitor the jobs progress
            if monitor:
                monitor_job.s(
                    cluster, job, log_write_url=log_write_url,
                    girder_token=girder_token).apply_async(countdown=5)

        # Now update the status of the job
        headers = {'Girder-Token': girder_token}
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.QUEUED})
        check_status(r)
    except Exception as ex:
        traceback.print_exc()
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        # str(ex) rather than ex.message: the latter does not exist on
        # Python 3 and would replace the real error with an AttributeError.
        get_job_logger(job, girder_token).exception(str(ex))
        raise
def _monitor_jobs(task, cluster, jobs, log_write_url=None, girder_token=None,
                  monitor_interval=5):
    """
    Poll the queue for ``jobs``, update each job's status in Girder and retry
    the task while any job is still in an active state.

    Jobs whose stored status is ``JobState.TERMINATED`` are left untouched.
    Connection problems inside the polling step (``EOFError``,
    ``NoValidConnectionsError``) trigger a retry after 5 seconds.

    :param task: The bound task, used for ``retry``.
    :param cluster: The cluster document the jobs run on.
    :param jobs: The jobs to monitor.
    :param log_write_url: Forwarded to the job state objects.
    :param girder_token: Token used for Girder requests and the connection.
    :param monitor_interval: Seconds to wait before the next poll.
    :raises Exception: Unexpected failures are re-raised after the cluster
                       has been patched to ``error`` and the error logged.
    """
    headers = {'Girder-Token': girder_token}
    cluster_url = '%s/clusters/%s' % (
        cumulus.config.girder.baseUrl, cluster['_id'])

    try:
        with get_connection(girder_token, cluster) as conn:
            try:
                job_queue_states \
                    = get_queue_adapter(cluster, conn).job_statuses(jobs)

                new_states = set()
                for (job, state) in job_queue_states:
                    job_id = job['_id']
                    # First get the current status
                    status_url = '%s/jobs/%s/status' % (
                        cumulus.config.girder.baseUrl, job_id)
                    r = requests.get(status_url, headers=headers)
                    check_status(r)
                    current_status = r.json()['status']

                    if current_status == JobState.TERMINATED:
                        continue

                    job_status = from_string(current_status, task=task,
                                             cluster=cluster, job=job,
                                             log_write_url=log_write_url,
                                             girder_token=girder_token,
                                             conn=conn)
                    job_status = job_status.next(state)
                    job['status'] = str(job_status)
                    job_status.run()

                    # Named "body" so the json module name is not shadowed
                    body = {
                        'status': str(job_status),
                        'timings': job.get('timings', {}),
                        'output': job['output']
                    }
                    job_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl,
                                              job['_id'])
                    r = requests.patch(job_url, headers=headers, json=body)
                    check_status(r)

                    new_states.add(job['status'])

                # Now see if we still have jobs to monitor
                running_states = set(
                    [JobState.CREATED, JobState.QUEUED,
                     JobState.RUNNING, JobState.TERMINATING]
                )

                # Do we have any job still in a running state?
                if new_states & running_states:
                    task.retry(countdown=monitor_interval)
            except EOFError:
                # Try again
                task.retry(countdown=5)
                return
            except paramiko.ssh_exception.NoValidConnectionsError:
                # Try again
                task.retry(countdown=5)
                return
    # Ensure that the Retry exception will get through
    except Retry:
        raise
    except paramiko.ssh_exception.NoValidConnectionsError as ex:
        r = requests.patch(cluster_url, headers=headers,
                           json={'status': 'error'})
        check_status(r)
        # str(ex) rather than ex.message, which does not exist on Python 3
        get_cluster_logger(cluster, girder_token).exception(str(ex))
    except Exception as ex:
        traceback.print_exc()
        r = requests.patch(cluster_url, headers=headers,
                           json={'status': 'error'})
        check_status(r)
        # str(ex) rather than ex.message: an AttributeError here would
        # replace the real error and skip the re-raise below.
        get_cluster_logger(cluster, girder_token).exception(str(ex))
        raise
def monitor_process(task, cluster, job, pid, nohup_out_path,
                    log_write_url=None, on_complete=None,
                    output_message='Job download/upload error: %s',
                    girder_token=None):
    """
    Check whether process ``pid`` is still running on the cluster and, once
    it has finished, inspect its nohup output.

    While the process is alive the task reschedules itself every 5 seconds.
    Once it is gone, any non-blank content in ``nohup_out_path`` is logged
    using ``output_message`` and the job is patched to ``JobState.ERROR``.
    Otherwise ``on_complete`` is fired (if given) and a job whose status is
    ``JobState.UPLOADING`` is moved on to its complete state.

    :param task: The bound task, used for ``retry``.
    :param cluster: The cluster document the process runs on.
    :param job: The job document the process belongs to.
    :param pid: The id of the process to look for.
    :param nohup_out_path: Path on the cluster of the process' output file.
    :param log_write_url: Forwarded to the job state object.
    :param on_complete: Optional signature to run once the process is done.
    :param output_message: Format string used to log the process output.
    :param girder_token: Token used for Girder requests and the connection.
    :raises Exception: Any failure other than ``EOFError`` is re-raised after
                       the job has been patched to
                       ``JobState.UNEXPECTEDERROR`` and logged.
    """
    job_id = job['_id']
    base_url = cumulus.config.girder.baseUrl
    log = get_post_logger(job_id, girder_token,
                          '%s/jobs/%s/log' % (base_url, job_id))
    headers = {'Girder-Token': girder_token}
    status_url = '%s/jobs/%s' % (base_url, job_id)

    try:
        # if terminating break out
        if _is_terminating(job, girder_token):
            return

        with get_connection(girder_token, cluster) as conn:
            # See if the process is still running
            ps_lines = conn.execute('ps %s | grep %s' % (pid, pid),
                                    ignore_exit_status=True,
                                    source_profile=False)

            if len(ps_lines) > 0:
                # Process is still running so schedule self again in about 5
                # secs
                # N.B. throw=False to prevent Retry exception being raised
                task.retry(throw=False, countdown=5)
                return

            try:
                local_name = os.path.basename(nohup_out_path)

                # Log the output
                with conn.get(nohup_out_path) as nohup_fp:
                    nohup_text = nohup_fp.read()
                    if nohup_text.strip():
                        log.error(output_message % nohup_text)
                        # If we have output then set the error state on the
                        # job and return
                        response = requests.patch(
                            status_url, headers=headers,
                            json={'status': JobState.ERROR})
                        check_status(response)
                        return
            finally:
                # Clean up a local file of the same name, if there is one
                if local_name and os.path.exists(local_name):
                    os.remove(local_name)

            # Fire off the on_complete task if we have one
            if on_complete:
                signature(on_complete).delay()

            # If we were uploading move job to the complete state
            if job['status'] == JobState.UPLOADING:
                state_obj = from_string(job['status'], task=task,
                                        cluster=cluster, job=job,
                                        log_write_url=log_write_url,
                                        girder_token=girder_token,
                                        conn=conn)
                state_obj = Complete(state_obj)
                state_obj = state_obj.next(JobQueueState.COMPLETE)
                state_obj.run()
                response = requests.patch(status_url, headers=headers,
                                          json={'status': str(state_obj)})
                check_status(response)
    except EOFError:
        # Try again
        task.retry(throw=False, countdown=5)
    except Exception as ex:
        response = requests.patch(
            status_url, headers=headers,
            json={'status': JobState.UNEXPECTEDERROR})
        check_status(response)
        get_job_logger(job, girder_token).exception(str(ex))
        raise
def upload_job_output_to_item(cluster, job, log_write_url=None, job_dir=None,
                              girder_token=None):
    """
    Start a background upload of the job output from the cluster and schedule
    ``monitor_process`` to watch it.

    The ``cumulus.girderclient`` source file is copied next to the job
    directory, a script running it under ``nohup`` is executed and the PID it
    prints is handed to ``monitor_process``. If the job's on-complete action
    is ``terminate``, a ``terminate_cluster`` signature is passed along to be
    run when the upload finishes.

    Failures are not re-raised: the job is patched to
    ``JobState.UNEXPECTEDERROR`` and the error is logged.

    :param cluster: The cluster document holding the job output.
    :param job: The job document whose output is uploaded.
    :param log_write_url: Forwarded to ``monitor_process``.
    :param job_dir: The job directory on the cluster.
    :param girder_token: Token used for Girder requests, the connection and
                         the upload command.
    """
    headers = {'Girder-Token': girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)

    try:
        # if terminating break out
        if _is_terminating(job, girder_token):
            return

        with get_connection(girder_token, cluster) as conn:
            # First put girder client on master
            path = inspect.getsourcefile(cumulus.girderclient)
            with open(path, 'r') as fp:
                conn.put(fp,
                         os.path.normpath(os.path.join(
                             job_dir, '..', os.path.basename(path))))

            cmds = ['cd %s' % job_dir]
            # NOTE(review): the token is embedded in the command line that is
            # written to the script below - confirm this is acceptable.
            upload_cmd = 'python ../girderclient.py --token %s --url "%s" ' \
                         'upload --job %s' \
                         % (girder_token,
                            cumulus.config.girder.baseUrl, job['_id'])

            upload_output = '%s.upload.out' % job_id
            upload_output_path = os.path.normpath(
                os.path.join(job_dir, '..', upload_output))
            cmds.append('nohup %s &> ../%s &\n' % (upload_cmd,
                                                   upload_output))

            # From here on upload_cmd is whatever _put_script returned; it
            # is what gets executed and then removed.
            upload_cmd = _put_script(conn, '\n'.join(cmds))
            output = conn.execute(upload_cmd)

            # Remove upload script
            conn.remove(upload_cmd)

        if len(output) != 1:
            raise Exception('PID not returned by execute command')

        try:
            pid = int(output[0])
        except ValueError:
            raise Exception('Unable to extract PID from: %s' % output)

        on_complete = None

        if _get_on_complete(job) == 'terminate':
            cluster_log_url = '%s/clusters/%s/log' % \
                (cumulus.config.girder.baseUrl, cluster['_id'])
            on_complete = signature(
                'cumulus.tasks.cluster.terminate_cluster',
                args=(cluster,),
                kwargs={'log_write_url': cluster_log_url,
                        'girder_token': girder_token})

        monitor_process.delay(cluster, job, pid, upload_output_path,
                              log_write_url=log_write_url,
                              on_complete=on_complete,
                              girder_token=girder_token)
    except Exception as ex:
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        # str(ex) rather than ex.message: the latter does not exist on
        # Python 3 and would raise an AttributeError out of this handler.
        get_job_logger(job, girder_token).exception(str(ex))
def create_geometry_symlink(task, job, cluster, fileName):
    """
    Symlink ``fileName`` from the job's first input path into the job
    directory on the cluster.

    :param task: The running task; its taskflow supplies the girder token.
    :param job: The job document; ``job['input'][0]['path']`` is the
                sub-directory that holds the file.
    :param cluster: The cluster to run the ``ln -s`` command on.
    :param fileName: Name of the file to link.
    """
    root = job_directory(cluster, job)
    source = '%s/%s/%s' % (root, job['input'][0]['path'], fileName)
    link = '%s/%s' % (root, fileName)
    command = 'ln -s %s %s' % (source, link)

    with get_connection(task.taskflow.girder_token, cluster) as conn:
        conn.execute(command)