def test_remote_worker(self):
    """Spin up a dockerized remote worker on a fresh pubsub queue and
    verify a hello-world experiment produces the expected output."""
    experiment_name = 'test_remote_worker_' + str(uuid.uuid4())
    queue_name = experiment_name

    logger = logs.getLogger('test_remote_worker')
    logger.setLevel(10)

    # Launch the worker process; it exits after one run (--single-run).
    worker_proc = subprocess.Popen(
        ['studio-start-remote-worker',
         '--queue=' + queue_name,
         '--single-run',
         '--no-cache',
         '--timeout=30',
         '--image=peterzhokhoff/studioml'],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT)

    stubtest_worker(
        self,
        experiment_name=experiment_name,
        runner_args=['--queue=' + queue_name, '--force-git'],
        config_name='test_config_http_client.yaml',
        test_script='tf_hello_world.py',
        script_args=['arg0'],
        expected_output='[ 2.0 6.0 ]',
        queue=PubsubQueue(queue_name))

    worker_output, _ = worker_proc.communicate()
    if worker_output:
        logger.debug(
            "studio-start-remote-worker output: \n" + str(worker_output))
def __init__(self, config):
    """
    Setup the example publisher object, passing in the URL we will use
    to connect to RabbitMQ.

    :param config: The completion_service config dictionary
    """
    self._config = config
    self._queue = self._find_queue_name(config)
    self._url = self._find_queue_server_url(config)

    # Routing key is the 'StudioML.' prefix plus the queue name,
    # or the bare prefix when no queue is configured.
    routing_key = 'StudioML.'
    if self._queue is not None:
        routing_key = routing_key + self._queue
    self._routing_key = routing_key

    # Connection state, guarded by a re-entrant lock.
    self._rmq_lock = threading.RLock()
    self._connection = None
    self._channel = None
    self._stopping = False

    self._exchange = 'StudioML.topic'
    self._exchange_type = 'topic'

    self._logger = logs.getLogger('RabbitMQShutdown')
    self._logger.setLevel(logging.INFO)
    self._cleanup_done = False
def test_unfold_tuples(self):
    """Check that convert_to_tuples expands two hyperparameters into the
    full cartesian product of value dictionaries."""
    # Logger name fixed: was copy-pasted from test_stop_experiment.
    logger = logs.getLogger('test_unfold_tuples')
    h = HyperparameterParser(RunnerArgs(), logger)
    hyperparams = [
        Hyperparameter(name='a', values=[1, 2, 3]),
        Hyperparameter(name='b', values=[4, 5])
    ]
    expected_tuples = [
        {'a': 1, 'b': 4}, {'a': 2, 'b': 4}, {'a': 3, 'b': 4},
        {'a': 1, 'b': 5}, {'a': 2, 'b': 5}, {'a': 3, 'b': 5}
    ]
    # Order of the product is not specified, so compare sorted.
    # assertEquals is a deprecated alias removed in Python 3.12;
    # use assertEqual.
    self.assertEqual(
        sorted(h.convert_to_tuples(hyperparams), key=lambda x: str(x)),
        sorted(expected_tuples, key=lambda x: str(x)))
def test_experiment_lifetime(self):
    """Submit an experiment with an already-expired lifetime (-10m) and
    clean it up from the database afterwards."""
    my_path = os.path.dirname(os.path.realpath(__file__))

    logger = logs.getLogger('test_experiment_lifetime')
    logger.setLevel(10)

    config_name = os.path.join(my_path, 'test_config_http_client.yaml')
    key = 'test_experiment_lifetime' + str(uuid.uuid4())

    with model.get_db_provider(model.get_config(config_name)) as db:
        # Best-effort removal of any stale experiment under this key.
        try:
            db.delete_experiment(key)
        except Exception:
            pass

        runner_cmd = ['studio', 'run',
                      '--config=' + config_name,
                      '--experiment=' + key,
                      '--force-git',
                      '--verbose=debug',
                      '--lifetime=-10m',
                      'stop_experiment.py']
        runner_proc = subprocess.Popen(runner_cmd,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.STDOUT,
                                       cwd=my_path)
        runner_out, _ = runner_proc.communicate()

        if runner_out:
            logger.debug("studio run output: \n" + runner_out.decode())

        db.delete_experiment(key)
def test_remote_worker_co(self):
    """Run a remote worker with a --capture-once artifact and verify the
    captured file's random contents appear in the experiment output."""
    logger = logs.getLogger('test_remote_worker_co')
    logger.setLevel(10)

    # Seed a temp file with a random marker string to be captured.
    tmpfile = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
    random_str = str(uuid.uuid4())
    with open(tmpfile, 'w') as f:
        f.write(random_str)

    experiment_name = 'test_remote_worker_co_' + str(uuid.uuid4())
    queue_name = experiment_name

    pw = subprocess.Popen(
        ['studio-start-remote-worker',
         '--queue=' + queue_name,
         '--single-run',
         '--no-cache',
         '--image=peterzhokhoff/studioml'],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT)

    stubtest_worker(
        self,
        experiment_name=experiment_name,
        runner_args=[
            '--capture-once=' + tmpfile + ':f',
            '--queue=' + queue_name,
            '--force-git'
        ],
        config_name='test_config_http_client.yaml',
        test_script='art_hello_world.py',
        script_args=[],
        expected_output=random_str,
        queue=PubsubQueue(queue_name))

    workerout, _ = pw.communicate()
    # Guard the debug log like the sibling tests do, so we do not log a
    # literal 'None' when the worker produced no output.
    if workerout:
        logger.debug('studio-start-remote-worker output: \n' +
                     str(workerout))

    os.remove(tmpfile)
def test_two_receivers(self):
    """Two clients attached to the same queue: a message enqueued by one
    must be retrievable by the other, and two messages must be delivered
    exactly once across both receivers."""
    logger = logs.getLogger('test_two_receivers')
    logger.setLevel(10)

    q1 = self.get_queue()
    q1.clean()
    q2 = self.get_queue(q1.get_name())

    data1 = str(uuid.uuid4())
    data2 = str(uuid.uuid4())
    logger.debug('data1 = ' + data1)
    logger.debug('data2 = ' + data2)

    # A message sent through q1 must be visible to q2.
    q1.enqueue(data1)
    # assertEquals is a deprecated alias removed in Python 3.12;
    # use assertEqual.
    self.assertEqual(data1, q2.dequeue(timeout=self.get_timeout()))

    # Two messages, two receivers: each delivered exactly once.
    q1.enqueue(data1)
    q1.enqueue(data2)
    recv1 = q1.dequeue(timeout=self.get_timeout())
    recv2 = q2.dequeue(timeout=self.get_timeout())
    logger.debug('recv1 = ' + recv1)
    logger.debug('recv2 = ' + recv2)

    self.assertTrue(data1 == recv1 or data2 == recv1)
    self.assertTrue(data1 == recv2 or data2 == recv2)
    self.assertFalse(recv1 == recv2)

    # The queue must now be drained for both receivers.
    self.assertTrue(q1.dequeue() is None)
    self.assertTrue(q2.dequeue() is None)
def test_remote_worker_c(self):
    """Run a remote worker with a --capture artifact: the initial file
    contents must come back in the output, and the artifact mutated by
    the test script must be downloadable afterwards."""
    tmpfile = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
    logger = logs.getLogger('test_remote_worker_c')
    logger.setLevel(10)
    experiment_name = "test_remote_worker_c_" + str(uuid.uuid4())

    # Seed the captured artifact with a known random string.
    random_str1 = str(uuid.uuid4())
    with open(tmpfile, 'w') as f:
        f.write(random_str1)
    # The test script overwrites the artifact with this string.
    random_str2 = str(uuid.uuid4())

    queue_name = experiment_name
    pw = subprocess.Popen(
        ['studio-start-remote-worker',
         '--queue=' + queue_name,
         '--single-run',
         '--no-cache',
         '--image=peterzhokhoff/studioml'],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT)

    db = stubtest_worker(
        self,
        experiment_name=experiment_name,
        runner_args=[
            '--capture=' + tmpfile + ':f',
            '--queue=' + queue_name,
            '--force-git'
        ],
        config_name='test_config_http_client.yaml',
        test_script='art_hello_world.py',
        script_args=[random_str2],
        expected_output=random_str1,
        queue=PubsubQueue(queue_name),
        delete_when_done=False)

    workerout, _ = pw.communicate()
    if workerout:
        logger.debug("studio-start-remote-worker output: \n" +
                     str(workerout))

    os.remove(tmpfile)

    tmppath = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
    if os.path.exists(tmppath):
        os.remove(tmppath)

    # Pull the mutated artifact back and check its new contents.
    db.get_artifact(
        db.get_experiment(experiment_name).artifacts['f'],
        tmppath, only_newer=False)
    with open(tmppath, 'r') as f:
        # assertEquals is a deprecated alias removed in Python 3.12;
        # use assertEqual.
        self.assertEqual(f.read(), random_str2)
    os.remove(tmppath)
    db.delete_experiment(experiment_name)
def test_parse_range(self):
    """Exercise HyperparameterParser._parse_grid on the supported range
    syntaxes: comma lists, ':'-delimited linear ranges (with optional
    start/step or count), and 'Nl' log-scale counts."""
    # Logger name fixed: was copy-pasted from test_stop_experiment.
    logger = logs.getLogger('test_parse_range')
    h = HyperparameterParser(RunnerArgs(), logger)
    range_strs = [
        '1,2,3', ':5', '2:5', '0.1:0.05:0.3', '0.1:3:0.3', '0.01:4l:10'
    ]
    gd_truths = [
        [1.0, 2.0, 3.0],
        [0.0, 1.0, 2.0, 3.0, 4.0, 5.0],
        [2.0, 3.0, 4.0, 5.0],
        [0.1, 0.15, 0.2, 0.25, 0.3],
        [0.1, 0.2, 0.3],
        [0.01, 0.1, 1, 10]
    ]
    for range_str, gd_truth in zip(range_strs, gd_truths):
        hyperparameter = h._parse_grid("test", range_str)
        # Float ranges are compared with tolerance, not exact equality.
        self.assertTrue(np.isclose(hyperparameter.values, gd_truth).all())
def __init__(self,
             name: str,
             receiver_keypath: str,
             sender_keypath: str = None):
    """
    param: name - payload builder name
    param: receiver_keypath - file path to .pem file
                              with recipient public key
    param: sender_keypath - file path to .pem file
                            with sender private key
    """
    super(EncryptedPayloadBuilder, self).__init__(name)

    # XXX Set logger verbosity level here
    self.logger = logs.getLogger(self.__class__.__name__)

    # Load the recipient's RSA public key; fail fast if it is unusable.
    self.recipient_key_path = receiver_keypath
    self.recipient_key = None
    try:
        self.recipient_key =\
            RSA.import_key(open(self.recipient_key_path).read())
    except Exception:
        # Was a bare 'except:', which would also swallow
        # KeyboardInterrupt/SystemExit; catch Exception instead.
        msg = "FAILED to import recipient public key from: {0}"\
            .format(self.recipient_key_path)
        self.logger.error(msg)
        raise ValueError(msg)

    self.sender_key_path = sender_keypath
    self.sender_key = None
    self.sender_fingerprint = None

    if self.sender_key_path is None:
        self.logger.error(
            "Signing key path must be specified for encrypted payloads."
            " ABORTING.")
        raise ValueError()

    # We expect ed25519 signing key in "private key" format
    try:
        self.sender_key =\
            paramiko.Ed25519Key(filename=self.sender_key_path)
        if self.sender_key is None:
            self._raise_error(
                "Failed to import private signing key. ABORTING.")
    except Exception:
        # Narrowed from a bare 'except:' for the same reason as above.
        self._raise_error("FAILED to open/read private signing key file: {0}"
                          .format(self.sender_key_path))

    self.sender_fingerprint = \
        self._get_fingerprint(self.sender_key)

    self.simple_builder =\
        UnencryptedPayloadBuilder("simple-builder-for-encryptor")
def test_stop_experiment(self):
    """Start an experiment, wait for it to leave 'waiting' status, stop
    it through the db provider, then clean up."""
    my_path = os.path.dirname(os.path.realpath(__file__))

    logger = logs.getLogger('test_stop_experiment')
    logger.setLevel(10)

    config_name = os.path.join(my_path, 'test_config_http_client.yaml')
    key = 'test_stop_experiment' + str(uuid.uuid4())

    with model.get_db_provider(model.get_config(config_name)) as db:
        # Best-effort removal of any stale experiment under this key.
        try:
            db.delete_experiment(key)
        except Exception:
            pass

        p = subprocess.Popen(['studio', 'run',
                              '--config=' + config_name,
                              '--experiment=' + key,
                              '--force-git',
                              '--verbose=debug',
                              'stop_experiment.py'],
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             cwd=my_path)

        # wait till experiment spins up
        experiment = None
        while experiment is None or experiment.status == 'waiting':
            time.sleep(1)
            try:
                experiment = db.get_experiment(key)
            except Exception:
                # Was 'except BaseException: pass', which also swallowed
                # KeyboardInterrupt and made this poll loop unbreakable.
                pass

        logger.info('Stopping experiment')
        db.stop_experiment(key)
        pout, _ = p.communicate()
        if pout:
            logger.debug("studio run output: \n" + pout.decode())

        db.delete_experiment(key)
def __init__(self, name: str, keypath: str):
    """
    param: name - payload builder name
    param: keypath - file path to .pem file with public key
    """
    super(EncryptedPayloadBuilder, self).__init__(name)

    # XXX Set logger verbosity level here
    self.logger = logs.getLogger(self.__class__.__name__)

    self.key_path = keypath
    self.recipient_key = None
    try:
        self.recipient_key = RSA.import_key(open(self.key_path).read())
    except Exception:
        # Was a bare 'except:' that logged and returned, leaving the
        # object half-initialized (simple_builder unset) — any later use
        # would fail with AttributeError. Narrow the catch and fail fast
        # with ValueError, matching the other builder variants.
        msg = "FAILED to import recipient public key from: {0}".format(
            self.key_path)
        self.logger.error(msg)
        raise ValueError(msg)

    self.simple_builder =\
        UnencryptedPayloadBuilder("simple-builder-for-encryptor")
def __init__(self, config, studio_experiment_ids,
             config_key=None, folders=None):
    """
    :param config: The completion_service config dictionary
    :param studio_experiment_ids: A list of known studio experiment ids.
    :param config_key: The config key with the target persistent store
        from which we would like to delete stuff
    :param folders: The folders within that persistent store we would
        like to delete things from.
    """
    self.config = config
    self.studio_experiment_ids = studio_experiment_ids
    self.config_key = config_key
    self.folders = folders

    # Resolve the persistent-store handle for the configured key.
    self.persistence = self.find_persistence(config, config_key)

    self.logger = logs.getLogger('MinioShutdown')
    self.logger.setLevel(logging.INFO)
import importlib import shutil import pickle import os import sys import six from studio import fs_tracker, model, logs logger = logs.getLogger('completion_service_client') try: logger.setLevel(model.parse_verbosity(sys.argv[1])) except BaseException: logger.setLevel(10) def main(): logger.setLevel(logs.DEBUG) logger.debug('copying and importing client module') logger.debug('getting file mappings') artifacts = fs_tracker.get_artifacts() files = {} logger.debug("Artifacts = {}".format(artifacts)) for tag, path in six.iteritems(artifacts): if tag not in {'workspace', 'modeldir', 'tb', '_runner'}: if os.path.isfile(path): files[tag] = path elif os.path.isdir(path): dirlist = os.listdir(path)
def __init__(
        self,
        # Name of experiment
        experimentId,
        # Config yaml file
        config=None,
        # Number of remote workers to spin up
        num_workers=1,
        # Compute requirements, amount of RAM, GPU, etc
        # (None means "no extra requirements"; a dict is merged over
        # the defaults. Was a shared mutable default argument '{}'.)
        resources_needed=None,
        # Name of the queue for submission to a server.
        queue=None,
        # What computer resource to use, either AWS, Google, or local
        cloud=None,
        # Timeout for cloud instances
        cloud_timeout=100,
        # Bid price for EC2 spot instances
        bid='100%',
        # Keypair to use for EC2 workers
        ssh_keypair=None,
        # If true, get results that are submitted by other instances of CS
        resumable=False,
        # Whether to clean the submission queue on initialization
        clean_queue=True,
        # Whether to enable autoscaling for EC2 instances
        queue_upscaling=True,
        # Whether to delete the queue on shutdown
        shutdown_del_queue=False,
        # delay between queries for results
        sleep_time=1):
    """Set up a completion service instance for the given experiment."""
    self.config = model.get_config(config)
    self.cloud = cloud
    self.experimentId = experimentId
    self.project_name = "completion_service_" + experimentId

    # Copy the defaults so repeated instantiations never mutate the
    # module-level DEFAULT_RESOURCES_NEEDED dict in place.
    self.resources_needed = dict(DEFAULT_RESOURCES_NEEDED)
    if self.config.get('resources_needed'):
        self.resources_needed.update(self.config.get('resources_needed'))
    if resources_needed:
        self.resources_needed.update(resources_needed)

    self.wm = runner.get_worker_manager(self.config, self.cloud)

    self.logger = logs.getLogger(self.__class__.__name__)
    self.verbose_level = model.parse_verbosity(self.config['verbose'])
    self.logger.setLevel(self.verbose_level)

    self.queue = runner.get_queue(queue, self.cloud, self.verbose_level)
    self.queue_name = self.queue.get_name()
    self.clean_queue = clean_queue
    if self.clean_queue:
        self.queue.clean()

    self.cloud_timeout = cloud_timeout
    self.bid = bid
    self.ssh_keypair = ssh_keypair
    self.submitted = set()
    self.num_workers = num_workers
    self.resumable = resumable
    self.queue_upscaling = queue_upscaling
    self.shutdown_del_queue = shutdown_del_queue
    self.use_spot = cloud in ['ec2spot', 'gcspot']
    self.sleep_time = sleep_time
def stubtest_worker(
        testclass,
        experiment_name,
        runner_args,
        config_name,
        test_script,
        expected_output,
        script_args=None,
        queue=None,
        wait_for_experiment=True,
        delete_when_done=True,
        test_output=True,
        test_workspace=True):
    """Submit test_script via 'studio run', optionally wait for it to
    finish, check its last output line against expected_output, verify
    the workspace, and return the db provider used.

    Raises whatever the checks raise, after printing the worker output
    and a traceback for diagnosis.
    """
    # Avoid mutable/stateful defaults: 'script_args=[]' is a shared
    # mutable default, and 'queue=LocalQueue()' would be constructed
    # once at import time and shared by every call.
    if script_args is None:
        script_args = []
    if queue is None:
        queue = LocalQueue()

    my_path = os.path.dirname(os.path.realpath(__file__))
    config_name = os.path.join(my_path, config_name)
    logger = logs.getLogger('stubtest_worker')
    logger.setLevel(10)

    queue.clean()

    with model.get_db_provider(model.get_config(config_name)) as db:
        # Best-effort removal of any stale experiment under this name.
        try:
            db.delete_experiment(experiment_name)
        except Exception:
            pass

    os.environ['PYTHONUNBUFFERED'] = 'True'
    p = subprocess.Popen(['studio', 'run'] + runner_args +
                         ['--config=' + config_name,
                          '--verbose=debug',
                          '--force-git',
                          '--experiment=' + experiment_name,
                          test_script] + script_args,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT,
                         close_fds=True,
                         cwd=my_path)
    pout, _ = p.communicate()

    if pout:
        logger.debug("studio run output: \n" + sixdecode(pout))

    # Recover the submitted experiment key from the runner's output.
    splitpout = sixdecode(pout).split('\n')
    experiments = [
        line.split(' ')[-1] for line in splitpout
        if line.startswith('studio run: submitted experiment')
    ]
    logger.debug("added experiments: {}".format(experiments))

    db = model.get_db_provider(model.get_config(config_name))
    experiment_name = experiments[0]

    try:
        experiment = db.get_experiment(experiment_name)
        if wait_for_experiment:
            # Busy-poll until the experiment reports 'finished'.
            while not experiment or not experiment.status == 'finished':
                experiment = db.get_experiment(experiment_name)

        if test_output:
            with open(db.get_artifact(experiment.artifacts['output']),
                      'r') as f:
                data = f.read()
                split_data = data.strip().split('\n')
                print(data)
                # assertEquals is a deprecated alias removed in
                # Python 3.12; use assertEqual.
                testclass.assertEqual(split_data[-1], expected_output)

        if test_workspace:
            check_workspace(testclass, db, experiment_name)

        if delete_when_done:
            retry(lambda: db.delete_experiment(experiment_name),
                  sleep_time=10)

        return db

    except Exception as e:
        print("Exception {} raised during test".format(e))
        print("worker output: \n {}".format(pout))
        print("Exception trace:")
        print(traceback.format_exc())
        raise e
def __init__(
        self,
        # Name of experiment
        experimentId,
        # Completion service configuration
        cs_config=None,
        # used to pass a studioML configuration block read by client software
        studio_config=None,
        # Studio config yaml file
        studio_config_file=None,
        shutdown_del_queue=False):
    """Configure the completion service: resolve resources, pick the
    task queue and cloud backend, and create the worker manager."""
    # StudioML configuration
    self.config = model.get_config(studio_config_file)
    self.logger = logs.getLogger(self.__class__.__name__)
    self.verbose_level = model.parse_verbosity(self.config['verbose'])
    self.logger.setLevel(self.verbose_level)

    # Setup Completion Service instance properties
    # based on configuration
    self.experimentId = experimentId
    self.project_name = "completion_service_" + experimentId
    self.resumable = RESUMABLE
    self.clean_queue = CLEAN_QUEUE
    self.queue_upscaling = QUEUE_UPSCALING
    self.num_workers = int(cs_config.get('num_workers', 1))
    self.cloud_timeout = cs_config.get('timeout')
    self.bid = cs_config.get('bid')
    self.ssh_keypair = cs_config.get('ssh_keypair')
    self.sleep_time = cs_config.get('sleep_time')
    self.shutdown_del_queue = shutdown_del_queue

    # Figure out request for resources. Copy the defaults so we never
    # mutate the module-level DEFAULT_RESOURCES_NEEDED dict, and guard
    # against cs_config.get() returning None (update(None) raises).
    self.resources_needed = dict(DEFAULT_RESOURCES_NEEDED)
    resources_needed = cs_config.get('resources_needed')
    if resources_needed:
        self.resources_needed.update(resources_needed)
    studio_resources = self.config.get('resources_needed')
    if studio_resources:
        self.resources_needed.update(studio_resources)

    # Figure out task queue and cloud we are going to use:
    queue_name = cs_config.get('queue')
    cloud_name = cs_config.get('cloud')
    if cs_config.get('local'):
        queue_name = None
        cloud_name = None
    elif queue_name is not None:
        # An externally-owned queue is never deleted on shutdown.
        self.shutdown_del_queue = False
        if cloud_name in ['ec2spot', 'ec2']:
            assert queue_name.startswith("sqs_")
    else:
        # BUGFIX: was 'self.experiment_id', which does not exist (the
        # attribute is 'experimentId') and raised AttributeError.
        queue_name = self.experimentId
        if cloud_name in ['ec2spot', 'ec2']:
            queue_name = "sqs_" + queue_name

    self.cloud = cloud_name
    # RabbitMQ queues are local-only; they cannot pair with a cloud.
    if queue_name is not None and queue_name.startswith("rmq_"):
        assert self.cloud is None

    self.wm = runner.get_worker_manager(self.config, self.cloud)

    if queue_name is not None:
        self.logger.info(
            "CompletionService configured with queue {0}".format(
                queue_name))

    self.queue = runner.get_queue(queue_name=queue_name,
                                  cloud=self.cloud,
                                  config=self.config,
                                  logger=self.logger,
                                  verbose=self.verbose_level)
    self.queue_name = self.queue.get_name()
    self.submitted = {}
    self.use_spot = cloud_name in ['ec2spot', 'gcspot']

    self.logger.info("Project name: {0}".format(self.project_name))
    self.logger.info("Initial/final queue name: {0}, {1}".format(
        queue_name, self.queue_name))
    self.logger.info("Cloud name: {0}".format(self.cloud))
def __init__(self,
             name: str,
             receiver_keypath: str,
             sender_keypath: str = None):
    """
    param: name - payload builder name
    param: receiver_keypath - file path to .pem file
                              with recipient public key
    param: sender_keypath - file path to .pem file
                            with sender private key
    """
    super(EncryptedPayloadBuilder, self).__init__(name)

    # XXX Set logger verbosity level here
    self.logger = logs.getLogger(self.__class__.__name__)

    # Load the recipient's RSA public key; any failure aborts
    # construction with ValueError.
    self.recipient_key_path = receiver_keypath
    self.recipient_key = None
    try:
        self.recipient_key =\
            RSA.import_key(open(self.recipient_key_path).read())
    except:
        # Deliberate bare except: check_for_kb_interrupt() re-raises
        # keyboard interrupts before we convert anything else into
        # a ValueError.
        check_for_kb_interrupt()
        msg = "FAILED to import recipient public key from: {0}"\
            .format(self.recipient_key_path)
        self.logger.error(msg)
        raise ValueError(msg)

    # Signing-key state; populated below from sender_keypath.
    self.sender_key_path = sender_keypath
    self.sender_key: SigningKey = None
    self.verify_key: VerifyKey = None
    self.sender_fingerprint = None

    # A signing key is mandatory: encrypted payloads are always signed.
    if self.sender_key_path is None:
        self.logger.error(
            "Signing key path must be specified for encrypted payloads. ABORTING."
        )
        raise ValueError()

    # We expect ed25519 signing key in "openssh private key" format
    try:
        public_key_data, private_key_data =\
            Ed25519KeyUtil.parse_private_key_file(
                self.sender_key_path, self.logger)
        if public_key_data is None or private_key_data is None:
            self._raise_error(
                "Failed to import private signing key from {0}. ABORTING.".
                format(self.sender_key_path))
        self.sender_key = SigningKey(private_key_data)
        self.verify_key = VerifyKey(public_key_data)
    except Exception:
        # NOTE(review): _raise_error above presumably raises, so its
        # exception is also caught here and re-reported with this
        # file-read message — confirm that is intended.
        self._raise_error("FAILED to open/read private signing key file: {0}"\
            .format(self.sender_key_path))

    # Fingerprint identifies the sender by its public key material.
    self.sender_fingerprint = \
        self._get_fingerprint(public_key_data)

    # Plain builder used for the inner (pre-encryption) payload.
    self.simple_builder =\
        UnencryptedPayloadBuilder("simple-builder-for-encryptor")
import time

from studio import logs

# Simple heartbeat script: logs the elapsed seconds, forever.
logger = logs.getLogger('helloworld')
logger.setLevel(10)

elapsed = 0
while True:
    logger.info('{} seconds passed '.format(elapsed))
    time.sleep(1)
    elapsed += 1
def test_baked_image(self):
    """Create a docker image with baked-in credentials, run a remote
    worker test with it, and remove the image afterwards. Skips (by
    returning) when docker is not installed."""
    logger = logs.getLogger('test_baked_image')
    logger.setLevel(logs.DEBUG)

    # check if docker is installed
    dockertestp = subprocess.Popen(['docker'],
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT)
    dockertestout, _ = dockertestp.communicate()
    if dockertestout:
        logger.info("docker test output: \n" + str(dockertestout))

    if dockertestp.returncode != 0:
        logger.error("docker is not installed (correctly)")
        return

    # Bake credentials into a uniquely named image.
    image = 'test_image' + str(uuid.uuid4())
    addcredsp = subprocess.Popen(
        [
            'studio-add-credentials',
            '--tag=' + image,
            '--base-image=peterzhokhoff/studioml'
        ],
        # stdout=subprocess.PIPE,
        # stderr=subprocess.STDOUT
    )
    addcredsout, _ = addcredsp.communicate()
    if addcredsout:
        logger.info('studio-add-credentials output: \n' +
                    str(addcredsout))
    if addcredsp.returncode != 0:
        logger.error("studio-add-credentials failed.")
        # self.fail is the idiomatic form of assertTrue(False).
        self.fail("studio-add-credentials failed.")

    experiment_name = 'test_remote_worker_baked' + str(uuid.uuid4())
    queue_name = experiment_name
    # (The original re-created the same logger here with setLevel(10);
    # removed as redundant — logs.DEBUG is level 10.)

    pw = subprocess.Popen(
        [
            'studio-start-remote-worker',
            '--queue=' + queue_name,
            '--no-cache',
            '--single-run',
            '--timeout=30',
            '--image=' + image
        ],
        # stdout=subprocess.PIPE,
        # stderr=subprocess.STDOUT
    )

    stubtest_worker(
        self,
        experiment_name=experiment_name,
        runner_args=['--queue=' + queue_name, '--force-git'],
        config_name='test_config_http_client.yaml',
        test_script='tf_hello_world.py',
        script_args=['arg0'],
        expected_output='[ 2.0 6.0 ]',
        queue=PubsubQueue(queue_name))

    workerout, _ = pw.communicate()
    if workerout:
        logger.debug("studio-start-remote-worker output: \n" +
                     str(workerout))

    # Remove the temporary image.
    rmip = subprocess.Popen(['docker', 'rmi', image],
                            stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT)
    rmiout, _ = rmip.communicate()
    if rmiout:
        logger.info('docker rmi output: \n' + str(rmiout))