def set_job_resources(num_gpus=0, ram=None):
    """
    Specifies the resources with which a job runs. The available amounts depend
    on the infrastructure that the Foundations job orchestrator is set up on.

    Arguments:
        num_gpus {int} -- The number of GPUs to run the job with. Defaults to 0,
            which runs the job with CPU resources instead.
        ram {number} -- The amount of RAM in GB to use while running the job.
            Must be greater than 0 or None. If None, no limit is set.

    Returns:
        - This function doesn't return a value.

    Raises:
        ValueError -- If the RAM quantity is not None and not greater than 0,
            or if the GPU quantity is not a non-negative integer.

    Notes:
        Setting the resources for a job from a given notebook or driver file
        causes any additional jobs (ex: hyperparameter search) deployed from
        the same file and using the same process to use the same resources,
        unless specified otherwise. To reset to the defaults (CPU-only, no RAM
        limit), call set_job_resources(0, None).
    """
    if ram is not None and ram <= 0:
        raise ValueError('Invalid RAM quantity. Please provide a RAM quantity greater than zero.')
    if not isinstance(num_gpus, int) or num_gpus < 0:
        raise ValueError('Invalid GPU quantity. Please provide a non-negative integer GPU quantity.')

    job_resources = JobResources(num_gpus, ram)
    current_foundations_context().set_job_resources(job_resources)
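# Usage sketch (not part of the original source): assumes `foundations` exposes
# set_job_resources at the package level; the GPU count and RAM value below are
# purely illustrative.
import foundations

# Request 2 GPUs and a 4 GB RAM limit for jobs deployed from this driver file.
foundations.set_job_resources(num_gpus=2, ram=4)

# Reset to the defaults: CPU-only execution (0 GPUs) with no RAM limit.
foundations.set_job_resources(0, None)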
def deploy(project_name, entrypoint, params):
    import os
    import os.path as path
    import json

    from foundations_contrib.job_deployer import deploy_job
    from foundations_contrib.global_state import (
        current_foundations_context,
        redis_connection,
        config_manager,
    )
    from foundations_internal.pipeline_context_wrapper import PipelineContextWrapper

    if project_name is None:
        project_name = path.basename(os.getcwd())

    current_foundations_context().set_project_name(project_name)
    config_manager["run_script_environment"] = {
        "script_to_run": entrypoint,
        "enable_stages": False,
    }
    current_foundations_context().pipeline_context().provenance.user_name = (
        _get_user_name_from_token()
    )
    pipeline_context_wrapper = PipelineContextWrapper(
        current_foundations_context().pipeline_context()
    )

    if params is not None:
        with open("foundations_job_parameters.json", "w+") as params_file:
            json.dump(params, params_file)

    return deploy_job(pipeline_context_wrapper, None, {})
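# Hypothetical driver-side invocation (assumes deploy() is importable from this
# module); the project name, entrypoint script, and parameter dict are illustrative.
deployment = deploy(
    project_name='demo-project',      # None would fall back to the current directory name
    entrypoint='main.py',             # script the worker runs inside the job container
    params={'learning_rate': 0.001},  # serialized to foundations_job_parameters.json
)
# Inside the job, foundations.load_parameters() presumably reads the serialized
# params back (see the load_parameters driver script further below).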
def _log_metric_in_running_job(key, value):
    from foundations_contrib.global_state import message_router, current_foundations_context
    from foundations_events.producers.metric_logged import MetricLogged

    project_name = current_foundations_context().project_name()
    job_id = current_foundations_context().job_id()

    metric_logged_producer = MetricLogged(message_router, project_name, job_id, key, value)
    metric_logged_producer.push_message()
def _log_param_in_running_job(key, value):
    from foundations_contrib.global_state import current_foundations_context, redis_connection

    project_name = current_foundations_context().project_name()
    job_id = current_foundations_context().job_id()

    _insert_parameter_name_into_projects_params_set(redis_connection, project_name, key)
    _insert_input_parameter_name_into_projects_input_params_set(redis_connection, project_name, key)
    _insert_parameter_value_into_job_run_data(redis_connection, job_id, key, value)
    _insert_input_parameter_name_into_job_input_parameter_data(redis_connection, job_id, key)
def _at_exit_callback():
    from foundations_contrib.global_state import (
        current_foundations_context,
        message_router,
    )
    from foundations_contrib.archiving.upload_artifacts import upload_artifacts
    from foundations_events.producers.jobs import CompleteJob
    from foundations_events.producers.jobs import FailedJob

    global _exception_happened

    pipeline_context = current_foundations_context().pipeline_context()
    upload_artifacts(pipeline_context.job_id)

    # This if/else block should be refactored at a later date
    if _exception_happened:
        FailedJob(
            message_router,
            pipeline_context,
            {"type": Exception, "exception": "", "traceback": []},
        ).push_message()
    else:
        CompleteJob(message_router, pipeline_context).push_message()
def test_ram_set_less_than_or_equal_to_zero_does_not_actually_set_job_resources(self):
    with self.assertRaises(ValueError) as error_context:
        set_job_resources(self.num_gpus, self.invalid_ram)

    job_resources = current_foundations_context().job_resources()
    self.assertEqual(self.default_job_resources, job_resources)
def log_warning_if_not_running_in_job(function_if_running_in_job, *args):
    from foundations_contrib.global_state import log_manager, current_foundations_context

    if current_foundations_context().is_in_running_job():
        function_if_running_in_job(*args)
    elif not log_manager.foundations_not_running_warning_printed():
        logger = log_manager.get_logger(__name__)
        logger.warning('Script not run with Foundations.')
        log_manager.set_foundations_not_running_warning_printed()
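# Illustrative wiring (an assumption, not taken from the original source): a
# public log_metric() could delegate to the running-job implementation through
# the guard above, so calls outside a job warn once instead of failing.
def log_metric(key, value):
    log_warning_if_not_running_in_job(_log_metric_in_running_job, key, value)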
def set_up(self):
    from uuid import uuid4
    from foundations_events.producers.jobs import QueueJob
    from foundations_contrib.global_state import message_router, current_foundations_context

    foundations.set_project_name('default')

    self._job_id = str(uuid4())
    pipeline_context = current_foundations_context().pipeline_context()
    pipeline_context.file_name = self._job_id

    queue_job = QueueJob(message_router, pipeline_context)
    queue_job.push_message()
def _set_tags(klass, job_name, tags):
    from foundations_contrib.global_state import current_foundations_context
    from foundations import set_tag

    pipeline_context = current_foundations_context().pipeline_context()
    pipeline_context.file_name = job_name

    if tags is not None:
        for key, value in tags.items():
            set_tag(key, value)

    pipeline_context.file_name = None
def create_syncable_directory(key, directory_path=None, source_job_id=None):
    from foundations.artifacts.syncable_directory import SyncableDirectory
    from foundations_contrib.global_state import current_foundations_context
    from tempfile import mkdtemp

    if directory_path is None:
        directory_path = mkdtemp()

    try:
        job_id = current_foundations_context().pipeline_context().file_name
    except ValueError:
        job_id = None

    return SyncableDirectory(key, directory_path, job_id, source_job_id or job_id)
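# Usage sketch (not from the original source): assumes create_syncable_directory
# is exposed on the `foundations` package; the key and path are illustrative.
import foundations

# Track a local checkpoints directory under the key 'checkpoints'; omitting
# directory_path would create a temporary directory via mkdtemp() instead.
checkpoints = foundations.create_syncable_directory('checkpoints', '/tmp/checkpoints')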
def save_artifact(filepath, key=None):
    from foundations_contrib.global_state import log_manager, current_foundations_context

    logger = log_manager.get_logger(__name__)
    foundations_context = current_foundations_context()

    if not foundations_context.is_in_running_job():
        logger.warning('Cannot save artifact outside of job.')
    else:
        job_id = foundations_context.job_id()
        artifact_saver = _ArtifactSaver(logger, filepath, job_id, key)
        artifact_saver.save_artifact()
def set_up_job_environment():
    from foundations_events.producers.jobs import QueueJob
    from foundations_events.producers.jobs import RunJob
    from foundations_contrib.global_state import (
        current_foundations_context,
        message_router,
        config_manager,
    )
    import atexit

    config_manager["_is_deployment"] = True
    _get_logger().debug(
        f"Foundations has been run with the following configuration:\n"
        f"{yaml.dump(config_manager.config(), default_flow_style=False)}"
    )

    pipeline_context = current_foundations_context().pipeline_context()
    _set_job_state(pipeline_context)

    QueueJob(message_router, pipeline_context).push_message()
    RunJob(message_router, pipeline_context).push_message()

    atexit.register(_at_exit_callback)
    _set_up_exception_handling()
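# Lifecycle summary (derived from set_up_job_environment() and _at_exit_callback()
# above, for orientation only):
#   1. QueueJob is pushed when the job environment is set up.
#   2. RunJob is pushed immediately afterwards.
#   3. At interpreter exit, the registered _at_exit_callback() uploads artifacts
#      and pushes CompleteJob, or FailedJob if _exception_happened was set.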
def _config():
    from uuid import uuid4
    from os import getcwd
    from foundations_contrib.global_state import (
        config_manager,
        current_foundations_context,
    )
    from foundations_contrib.local_file_system_pipeline_archive import LocalFileSystemPipelineArchive
    from foundations_contrib.local_file_system_pipeline_listing import LocalFileSystemPipelineListing

    # ensure a job uuid is set
    current_foundations_context().pipeline_context().file_name = "integration-test-job"

    # separates test runs
    test_uuid = uuid4()

    # below is used to create archives for all different types
    archive_root = getcwd() + "/tmp/archives_{}".format(test_uuid)
    archive_implementation = {
        "archive_type": LocalFileSystemPipelineArchive,
        "constructor_arguments": [archive_root],
    }

    config_manager["archive_listing_implementation"] = {
        "archive_listing_type": LocalFileSystemPipelineListing,
        "constructor_arguments": [archive_root],
    }
    config_manager["persisted_data_archive_implementation"] = archive_implementation
    config_manager["provenance_archive_implementation"] = archive_implementation
    config_manager["job_source_archive_implementation"] = archive_implementation
    config_manager["artifact_archive_implementation"] = archive_implementation
    config_manager["miscellaneous_archive_implementation"] = archive_implementation
import foundations
from foundations_contrib.global_state import current_foundations_context, redis_connection

foundations.log_metric('ugh', 10)

with open('thomas_text.txt', 'w') as f:
    f.write('ugh_square')

foundations.save_artifact('thomas_text.txt', 'just_some_artifact')
foundations.log_param('blah', 20)

redis_connection.set(
    'foundations_testing_job_id',
    current_foundations_context().pipeline_context().job_id,
)
def _create_job_spec(self, job_mount_path, working_dir_root_path, job_results_root_path,
                     container_config_root_path, job_id, project_name, username,
                     worker_container_overrides):
    from foundations_contrib.global_state import current_foundations_context

    worker_container = {
        'volumes': {
            job_mount_path: {"bind": "/job", "mode": "rw"},
            job_results_root_path: {"bind": job_results_root_path, "mode": "rw"},
            container_config_root_path: {"bind": "/root/.foundations/config", "mode": "rw"},
            working_dir_root_path: {"bind": working_dir_root_path, "mode": "rw"},
        },
        "working_dir": "/job/job_source",
        # [
        #     {'name': 'logging', 'mountPath': '/root/.foundations/logs'},
        #     {'name': 'execution-config', 'mountPath': '/root/.foundations/config/execution'},
        # ]
        'environment': {
            "FOUNDATIONS_USER": username,
            "FOUNDATIONS_JOB_ID": job_id,
            "FOUNDATIONS_PROJECT_NAME": project_name,
            "PYTHONPATH": "/job/",
            "FOUNDATIONS_HOME": "/root/.foundations/",
            "FOUNDATIONS_TOKEN": user_token(),
        },
        "network": "foundations-atlas",
    }

    job_resources = current_foundations_context().job_resources()

    if job_resources.ram is not None:
        worker_container['mem_limit'] = int(job_resources.ram)

    if job_resources.num_gpus is not None and job_resources.num_gpus > 0:
        worker_container['image'] = 'us.gcr.io/dessa-atlas/worker-gpu:latest'
        worker_container['runtime'] = 'nvidia'
    else:
        worker_container['image'] = 'us.gcr.io/dessa-atlas/worker:latest'
        worker_container['runtime'] = 'runc'

    for override_key in ['command', 'image', 'working_dir', 'entrypoint']:
        if override_key in worker_container_overrides:
            worker_container[override_key] = worker_container_overrides[override_key]

    if self._config['run_script_environment']['script_to_run']:
        worker_container['entrypoint'] = self._config['run_script_environment']['script_to_run']

    if 'args' in worker_container_overrides:
        worker_container['command'] = worker_container_overrides['args']

    # if not has_gpus:
    #     worker_container['env'] += [{'name': 'NVIDIA_VISIBLE_DEVICES', 'value': ''}]

    for override_key in ['environment', 'volumes']:
        if override_key in worker_container_overrides:
            worker_container[override_key] = {
                **worker_container[override_key],
                **worker_container_overrides[override_key],
            }

    if 'resources' in worker_container_overrides:
        # Ensure the resources mapping exists before merging overrides into it;
        # the original indexed worker_container['resources'] without creating it.
        resources = worker_container.setdefault('resources', {'limits': {}, 'requests': {}})
        for override_key in ['limits', 'requests']:
            if override_key in worker_container_overrides['resources']:
                resources[override_key].update(
                    worker_container_overrides['resources'][override_key])

    return worker_container
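# Illustrative worker_container_overrides dict (not from the original source),
# covering the keys _create_job_spec consumes; the image tag, args, and paths
# are hypothetical.
worker_container_overrides = {
    'image': 'us.gcr.io/dessa-atlas/worker:custom',          # replaces the default image
    'args': ['--epochs', '10'],                              # becomes the container command
    'environment': {'LOG_LEVEL': 'debug'},                   # merged over the default env vars
    'volumes': {'/data': {'bind': '/data', 'mode': 'ro'}},   # merged over the default mounts
    'resources': {'limits': {'cpu': '2'}},                   # merged into resources['limits']
}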
def _job_resources(self):
    from foundations_contrib.global_state import current_foundations_context
    return current_foundations_context().job_resources()
def _configure():
    from foundations_contrib.global_state import current_foundations_context
    current_foundations_context().pipeline_context().file_name = 'integration-test-job'
def test_set_job_resources_sets_job_resources_object_in_current_foundations_context(self):
    set_job_resources(self.num_gpus, self.ram)
    job_resources = current_foundations_context().job_resources()
    self.assertEqual(self.job_resources, job_resources)
def _set_job_id(self, job_id):
    from foundations_contrib.global_state import current_foundations_context
    current_foundations_context().pipeline_context().file_name = job_id
import os

import foundations
from foundations_contrib.global_state import current_foundations_context, message_router
from foundations_events.producers.jobs import RunJob

foundations.set_project_name('default')

job_id = os.environ['ACCEPTANCE_TEST_JOB_ID']
pipeline_context = current_foundations_context().pipeline_context()
pipeline_context.file_name = job_id

RunJob(message_router, pipeline_context).push_message()

foundations.set_tag('model type', 'simple mlp')
foundations.set_tag('data set', 'out of time')
foundations.set_tag('what I was doing,', 'drinking tea')

print('Hello World!')
def test_set_job_resources_ram_defaults_to_none(self):
    set_job_resources(num_gpus=self.num_gpus)
    job_resources = current_foundations_context().job_resources()
    self.assertEqual(JobResources(self.num_gpus, None), job_resources)
def set_up(self):
    from acceptance.cleanup import cleanup
    cleanup()
    current_foundations_context().pipeline_context().file_name = self.faker.uuid4()
def test_set_job_resources_num_gpus_defaults_to_zero(self):
    set_job_resources(ram=self.ram)
    job_resources = current_foundations_context().job_resources()
    self.assertEqual(JobResources(0, self.ram), job_resources)
import foundations
import json
from foundations_contrib.global_state import current_foundations_context

params = foundations.load_parameters()

print(current_foundations_context().job_id())
print(json.dumps(params))
def _pipeline_context(self):
    from foundations_contrib.global_state import current_foundations_context
    return current_foundations_context().pipeline_context()
def test_ram_set_to_none_is_valid_configuration(self):
    set_job_resources(self.num_gpus, None)
    expected_job_resources = JobResources(num_gpus=self.num_gpus, ram=None)
    job_resources = current_foundations_context().job_resources()
    self.assertEqual(expected_job_resources, job_resources)
def tear_down(self):
    current_foundations_context().reset_job_resources()
def test_gpu_set_to_negative_value_not_actually_set_job_resources(self):
    with self.assertRaises(ValueError) as error_context:
        set_job_resources(self.negative_gpus, self.ram)

    job_resources = current_foundations_context().job_resources()
    self.assertEqual(self.default_job_resources, job_resources)
def tear_down(self):
    current_foundations_context().pipeline_context().file_name = None