def run(self, experiment):
    if isinstance(experiment, basestring):
        experiment = self.db.get_experiment(experiment)
    elif not isinstance(experiment, model.Experiment):
        raise ValueError("Unknown type of experiment: " +
                         str(type(experiment)))

    self.logger.info("Experiment key: " + experiment.key)

    with model.get_db_provider(self.config) as db:
        db.start_experiment(experiment)

        """ Override env variables with those inside the queued message
        """
        env = dict(os.environ)
        if 'env' in self.config.keys():
            for k, v in self.config['env'].iteritems():
                if v is not None:
                    env[str(k)] = str(v)

        fs_tracker.setup_experiment(env, experiment, clean=True)
        log_path = fs_tracker.get_artifact_cache('output', experiment.key)

        # log_path = os.path.join(model_dir, self.config['log']['name'])

        self.logger.debug('Child process environment:')
        self.logger.debug(str(env))

        sched = BackgroundScheduler()
        sched.start()

        with open(log_path, 'w') as output_file:
            p = subprocess.Popen(
                ["python", experiment.filename] + experiment.args,
                stdout=output_file,
                stderr=subprocess.STDOUT,
                env=env,
                cwd=experiment.artifacts['workspace']['local'])
            # simple hack to show what's in the log file
            ptail = subprocess.Popen(["tail", "-f", log_path])

            sched.add_job(
                lambda: db.checkpoint_experiment(experiment),
                'interval',
                minutes=self.config['saveWorkspaceFrequencyMinutes'])

            def kill_if_stopped():
                if db.get_experiment(
                        experiment.key,
                        getinfo=False).status == 'stopped':
                    p.kill()

            sched.add_job(kill_if_stopped, 'interval', seconds=10)

            try:
                p.wait()
            finally:
                ptail.kill()
                db.finish_experiment(experiment)
                sched.shutdown()
def create_experiments(hyperparam_tuples):
    experiments = []
    # experiment_names = {}
    for hyperparam_tuple in hyperparam_tuples:
        experiment_name = experiment_name_base
        experiment_name += "__opt__%s__%s" % (rand_string(32),
                                              int(time.time()))
        experiment_name = experiment_name.replace('.', '_')

        workspace_new = fs_tracker.get_artifact_cache(
            'workspace', experiment_name)

        current_artifacts = artifacts.copy()
        current_artifacts.update({
            'workspace': {
                'local': workspace_new,
                'mutable': True
            }
        })

        rsync_cp(workspace_orig, workspace_new, ignore_arg, logger)
        # shutil.copytree(workspace_orig, workspace_new)

        for param_name, param_value in six.iteritems(hyperparam_tuple):
            if isinstance(param_value, np.ndarray):
                array_filepath = '/tmp/%s.npy' % rand_string(32)
                np.save(array_filepath, param_value)
                assert param_name not in current_artifacts
                current_artifacts[param_name] = {'local': array_filepath,
                                                 'mutable': False}
            else:
                with open(os.path.join(workspace_new, exec_filename),
                          'r') as f:
                    script_text = f.read()

                script_text = re.sub(
                    '\\b' + param_name + '\\b(?=[^=]*\\n)',
                    str(param_value),
                    script_text)

                with open(os.path.join(workspace_new, exec_filename),
                          'w') as f:
                    f.write(script_text)

        experiments.append(create_experiment(
            filename=exec_filename,
            args=other_args,
            experiment_name=experiment_name,
            project=project,
            artifacts=current_artifacts,
            resources_needed=resources_needed,
            metric=runner_args.metric,
            max_duration=runner_args.max_duration,
        ))
    return experiments
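# The rsync_cp helper used above is not shown in this section. A minimal
# sketch of what such a helper could look like (an assumption, not the
# verified studio implementation), mirroring the shutil.copytree call it
# replaced while honoring the --exclude-from ignore argument:

import subprocess


def rsync_cp(source, dest, ignore_arg='', logger=None):
    # The trailing '/' makes rsync copy the *contents* of source into dest.
    cmd = 'mkdir -p {dest} && rsync -a {ignore} {src}/ {dest}/'.format(
        dest=dest, ignore=ignore_arg, src=source)
    if logger:
        logger.debug('rsync cmd = {}'.format(cmd))
    subprocess.check_call(['/bin/bash', '-c', cmd])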
def __init__(self, key, filename, args, pythonenv,
             project=None,
             artifacts=None,
             status='waiting',
             resources_needed=None,
             time_added=None,
             time_started=None,
             time_last_checkpoint=None,
             time_finished=None,
             info={},
             git=None,
             metric=None):

    self.key = key
    self.filename = filename
    self.args = args if args else []
    self.pythonenv = pythonenv
    self.project = project

    workspace_path = os.path.abspath('.')
    model_dir = fs_tracker.get_model_directory(key)
    self.artifacts = {
        'workspace': {
            'local': workspace_path,
            'mutable': True
        },
        'modeldir': {
            'local': model_dir,
            'mutable': True
        },
        'output': {
            'local': fs_tracker.get_artifact_cache('output', key),
            'mutable': True
        },
        'tb': {
            'local': fs_tracker.get_tensorboard_dir(key),
            'mutable': True
        }
    }
    if artifacts is not None:
        self.artifacts.update(artifacts)

    self.resources_needed = resources_needed
    self.status = status
    self.time_added = time_added
    self.time_started = time_started
    self.time_last_checkpoint = time_last_checkpoint
    self.time_finished = time_finished
    self.info = info
    self.git = git
    self.metric = metric
def create_experiments(hyperparam_tuples):
    experiments = []
    experiment_names = {}
    for hyperparam_tuple in hyperparam_tuples:
        experiment_name = experiment_name_base
        for param_name, param_value in hyperparam_tuple.iteritems():
            experiment_name = experiment_name + '__' + \
                param_name + '__' + str(param_value)
        experiment_name = experiment_name.replace('.', '_')

        # if the experiment uses a previously used name, change it
        if experiment_name in experiment_names:
            new_experiment_name = experiment_name
            counter = 1
            while new_experiment_name in experiment_names:
                counter += 1
                new_experiment_name = "%s_v%s" % (experiment_name, counter)
            experiment_name = new_experiment_name
        experiment_names[experiment_name] = True

        workspace_orig = artifacts['workspace']['local'] \
            if 'workspace' in artifacts.keys() else '.'
        workspace_new = fs_tracker.get_artifact_cache(
            'workspace', experiment_name)

        current_artifacts = artifacts.copy()
        current_artifacts.update({
            'workspace': {
                'local': workspace_new,
                'mutable': True
            }
        })

        shutil.copytree(workspace_orig, workspace_new)

        with open(os.path.join(workspace_new, exec_filename), 'r') as f:
            script_text = f.read()

        for param_name, param_value in hyperparam_tuple.iteritems():
            script_text = re.sub('\\b' + param_name + '\\b(?=[^=]*\\n)',
                                 str(param_value), script_text)

        with open(os.path.join(workspace_new, exec_filename), 'w') as f:
            f.write(script_text)

        experiments.append(model.create_experiment(
            filename=exec_filename,
            args=other_args,
            experiment_name=experiment_name,
            project=project,
            artifacts=current_artifacts,
            resources_needed=resources_needed,
            metric=runner_args.metric))
    return experiments
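# The word-boundary substitution above rewrites occurrences of a
# hyperparameter name in the copied script whenever no '=' appears between
# the name and the following newline (i.e. it patches uses of the name,
# not its assignment). A small self-contained illustration with a
# hypothetical parameter `lr`:

import re

script = "lr = 0.001\nmodel.fit(x, y, learning_rate=lr)\n"
patched = re.sub('\\b' + 'lr' + '\\b(?=[^=]*\\n)', '0.01', script)
# patched == "lr = 0.001\nmodel.fit(x, y, learning_rate=0.01)\n"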
def put_artifact(
        self,
        artifact,
        local_path=None,
        cache=True,
        background=False):
    if local_path is None:
        local_path = artifact['local']

    key = artifact.get('key')

    if os.path.exists(local_path):
        tar_filename = os.path.join(tempfile.gettempdir(),
                                    str(uuid.uuid4()))

        if os.path.isdir(local_path):
            local_basepath = local_path
            local_nameonly = '.'
        else:
            local_nameonly = os.path.basename(local_path)
            local_basepath = os.path.dirname(local_path)

        ignore_arg = ''
        ignore_filepath = os.path.join(local_basepath, ".studioml_ignore")
        if os.path.exists(ignore_filepath) and \
                not os.path.isdir(ignore_filepath):
            ignore_arg = "--exclude-from=%s" % ignore_filepath
            # self.logger.debug('.studioml_ignore found: %s,'
            #                   ' files listed inside will'
            #                   ' not be tarred or uploaded'
            #                   % ignore_filepath)

        if cache and key:
            cache_dir = fs_tracker.get_artifact_cache(key)
            if cache_dir != local_path:
                debug_str = "Copying local path {} to cache {}" \
                    .format(local_path, cache_dir)
                if ignore_arg != '':
                    debug_str += ", excluding files in {}" \
                        .format(ignore_filepath)
                self.logger.debug(debug_str)

                util.rsync_cp(local_path, cache_dir, ignore_arg, self.logger)

        debug_str = ("Tarring and uploading directory. " +
                     "tar_filename = {}, " +
                     "local_path = {}, " +
                     "key = {}").format(tar_filename, local_path, key)
        if ignore_arg != '':
            debug_str += ", exclude = {}".format(ignore_filepath)
        self.logger.debug(debug_str)

        tarcmd = 'tar {} -czf {} -C {} {}'.format(
            ignore_arg,
            tar_filename,
            local_basepath,
            local_nameonly)
        self.logger.debug("Tar cmd = {}".format(tarcmd))

        tarp = subprocess.Popen(['/bin/bash', '-c', tarcmd],
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                close_fds=True)

        tarout, _ = tarp.communicate()
        if tarp.returncode != 0:
            self.logger.info('tar had a non-zero return code!')
            self.logger.info('tar output: \n ' + tarout)

        if key is None:
            key = 'blobstore/' + util.sha256_checksum(tar_filename) \
                  + '.tgz'

        def finish_upload():
            self._upload_file(key, tar_filename)
            os.remove(tar_filename)

        t = Thread(target=finish_upload)
        t.start()

        if background:
            return (key, t)
        else:
            t.join()
            return key
    else:
        self.logger.debug(("Local path {} does not exist. " +
                           "Not uploading anything.").format(local_path))
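# util.sha256_checksum is used above to derive the 'blobstore/...' key for
# keyless artifacts, but its implementation is not shown in this section.
# A minimal sketch under the assumption that it streams the file in chunks
# (not necessarily the actual studio implementation):

import hashlib


def sha256_checksum(filename, blocksize=64 * 1024):
    sha = hashlib.sha256()
    with open(filename, 'rb') as f:
        for block in iter(lambda: f.read(blocksize), b''):
            sha.update(block)
    return sha.hexdigest()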
def get_artifact(
        self,
        artifact,
        local_path=None,
        only_newer=True,
        background=False):
    key = artifact['key']

    if local_path is None:
        if 'local' in artifact.keys() and \
                os.path.exists(artifact['local']):
            local_path = artifact['local']
        else:
            if artifact['mutable']:
                local_path = fs_tracker.get_artifact_cache(key)
            else:
                local_path = fs_tracker.get_blob_cache(key)

    local_path = re.sub('\/\Z', '', local_path)
    local_basepath = os.path.dirname(local_path)

    self.logger.info(
        "Downloading dir {} to local path {} from storage...".format(
            key, local_path))

    if only_newer and os.path.exists(local_path):
        self.logger.debug(
            'Comparing date of the artifact in storage with local')
        storage_time = self._get_file_timestamp(key)
        local_time = os.path.getmtime(local_path)
        if storage_time is None:
            self.logger.info(
                "Unable to get storage timestamp, storage is either " +
                "corrupted or has not finished uploading")
            return local_path

        if local_time > storage_time - self.timestamp_shift:
            self.logger.info(
                "Local path is younger than stored, skipping the download")
            return local_path

    tar_filename = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
    self.logger.debug("tar_filename = {} ".format(tar_filename))

    def finish_download():
        self._download_file(key, tar_filename)
        if os.path.exists(tar_filename):
            # first, figure out if the tar file has a base path of .
            # or not
            self.logger.info("Untarring {}".format(tar_filename))
            listtar, _ = subprocess.Popen(
                ['tar', '-tzf', tar_filename],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE).communicate()
            listtar = listtar.strip().split('\n')
            self.logger.info('List of files in the tar: ' + str(listtar))
            if listtar[0].startswith('./'):
                # Files are archived into tar from .; adjust path
                # accordingly
                basepath = local_path
            else:
                basepath = local_basepath

            tarcmd = ('mkdir -p {} && ' +
                      'tar -xzf {} -C {} --keep-newer-files') \
                .format(basepath, tar_filename, basepath)
            tarp = subprocess.Popen(['/bin/bash', '-c', tarcmd],
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.STDOUT)

            tarout, tarerr = tarp.communicate()
            if tarp.returncode != 0:
                self.logger.info('tar had a non-zero return code!')
                self.logger.info('tar cmd = ' + tarcmd)
                self.logger.info('tar output: \n ' + tarout)

            if len(listtar) == 1:
                actual_path = os.path.join(basepath, listtar[0])
                self.logger.info('Renaming {} into {}'.format(
                    actual_path, local_path))
                os.rename(actual_path, local_path)

            os.remove(tar_filename)
        else:
            self.logger.warn(
                'file {} download failed'.format(tar_filename))

    t = Thread(target=finish_download)
    t.start()
    if background:
        return (local_path, t)
    else:
        t.join()
        return local_path
def put_artifact(
        self,
        artifact,
        local_path=None,
        cache=True,
        background=False):
    if local_path is None:
        local_path = artifact['local']

    key = artifact.get('key')

    if os.path.exists(local_path):
        tar_filename = os.path.join(tempfile.gettempdir(),
                                    str(uuid.uuid4()))

        if os.path.isdir(local_path):
            local_basepath = local_path
            local_nameonly = '.'
        else:
            local_nameonly = os.path.basename(local_path)
            local_basepath = os.path.dirname(local_path)

        if cache and key:
            cache_dir = fs_tracker.get_artifact_cache(key)
            if cache_dir != local_path:
                self.logger.debug(
                    "Copying local path {} to cache {}".format(
                        local_path, cache_dir))

                if os.path.exists(cache_dir) and os.path.isdir(cache_dir):
                    shutil.rmtree(cache_dir)

                pcp = subprocess.Popen(
                    ['cp', '-pR', local_path, cache_dir],
                    stdout=subprocess.PIPE,
                    stderr=subprocess.STDOUT)
                cpout, _ = pcp.communicate()
                if pcp.returncode != 0:
                    self.logger.info(
                        'cp returned non-zero exit code. Output:')
                    self.logger.info(cpout)

        self.logger.debug(
            ("Tarring and uploading directory. " +
             "tar_filename = {}, " +
             "local_path = {}, " +
             "key = {}").format(
                tar_filename,
                local_path,
                key))

        tarcmd = 'tar -czf {} -C {} {}'.format(
            tar_filename,
            local_basepath,
            local_nameonly)
        self.logger.debug("Tar cmd = {}".format(tarcmd))

        tarp = subprocess.Popen(['/bin/bash', '-c', tarcmd],
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT)

        tarout, _ = tarp.communicate()
        if tarp.returncode != 0:
            self.logger.info('tar had a non-zero return code!')
            self.logger.info('tar output: \n ' + tarout)

        if key is None:
            key = 'blobstore/' + util.sha256_checksum(tar_filename) \
                  + '.tgz'

        def finish_upload():
            self._upload_file(key, tar_filename)
            os.remove(tar_filename)

        t = Thread(target=finish_upload)
        t.start()

        if background:
            return (key, t)
        else:
            t.join()
            return key
    else:
        self.logger.debug(("Local path {} does not exist. " +
                           "Not uploading anything.").format(local_path))
def _tartifact(self, local_path, key, cache=True):
    tar_filename = os.path.join(tempfile.gettempdir(),
                                str(uuid.uuid4()))

    if os.path.isdir(local_path):
        local_basepath = local_path
        local_nameonly = '.'
    else:
        local_nameonly = os.path.basename(local_path)
        local_basepath = os.path.dirname(local_path)

    ignore_arg = ''
    ignore_filepath = os.path.join(local_basepath, ".studioml_ignore")
    if os.path.exists(ignore_filepath) and \
            not os.path.isdir(ignore_filepath):
        ignore_arg = "--exclude-from=%s" % ignore_filepath
        self.logger.debug('.studioml_ignore found: %s,'
                          ' files listed inside will'
                          ' not be tarred or uploaded'
                          % ignore_filepath)

    if cache and key:
        cache_dir = fs_tracker.get_artifact_cache(key)
        if cache_dir != local_path:
            debug_str = "Copying local path {} to cache {}" \
                .format(local_path, cache_dir)
            if ignore_arg != '':
                debug_str += ", excluding files in {}" \
                    .format(ignore_filepath)
            self.logger.debug(debug_str)

            util.rsync_cp(local_path, cache_dir, ignore_arg, self.logger)

    debug_str = ("Tarring artifact. " +
                 "tar_filename = {}, " +
                 "local_path = {}, " +
                 "key = {}").format(
        tar_filename,
        local_path,
        key)
    if ignore_arg != '':
        debug_str += ", exclude = {}".format(ignore_filepath)
    self.logger.debug(debug_str)

    tarcmd = 'tar {} {} -cf {} -C {} {}'.format(
        ignore_arg,
        compression_to_taropt(self.compression),
        tar_filename,
        local_basepath,
        local_nameonly)
    self.logger.debug("Tar cmd = {}".format(tarcmd))

    tic = time.time()
    tarp = subprocess.Popen(['/bin/bash', '-c', tarcmd],
                            stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT,
                            close_fds=True)

    tarout, _ = tarp.communicate()
    toc = time.time()

    if tarp.returncode != 0:
        self.logger.info('tar had a non-zero return code!')
        self.logger.info('tar output: \n ' + sixdecode(tarout))

    self.logger.info('tar finished in {}s'.format(toc - tic))
    return tar_filename
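# compression_to_taropt above maps the configured compression scheme to a
# tar flag; its implementation is not shown in this section. A plausible
# sketch of such a mapping (an assumption, not the verified studio code):


def compression_to_taropt(compression):
    return {
        None: '',
        'none': '',
        'gzip': '-z',
        'bzip2': '-j',
        'xz': '-J',
    }.get(compression, '-z')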
def get_artifact(
        self,
        artifact,
        local_path=None,
        only_newer=True,
        background=False):

    key = artifact.get('key')
    bucket = artifact.get('bucket')

    if key is None:
        assert not artifact['mutable']
        assert artifact.get('url') is not None or \
            artifact.get('qualified') is not None

        remote_path = artifact.get('url')
        if remote_path is None:
            remote_path = artifact.get('qualified')

        key = hashlib.sha256(remote_path.encode()).hexdigest()
        local_path = fs_tracker.get_blob_cache(key)

        if os.path.exists(local_path):
            self.logger.info(
                ('Immutable artifact exists at local_path {},' +
                 ' skipping the download').format(local_path))
            return local_path

        if artifact.get('url') is not None:
            download_file(remote_path, local_path, self.logger)
        else:
            if remote_path.startswith('dockerhub://') or \
                    remote_path.startswith('shub://'):
                self.logger.info(
                    ('Qualified {} points to a shub or dockerhub,' +
                     ' skipping the download').format(remote_path))
                return remote_path
            download_file_from_qualified(
                remote_path, local_path, self.logger)

        self.logger.debug(
            'Downloaded file {} from external source {}'.format(
                local_path, remote_path))
        return local_path

    if local_path is None:
        if 'local' in artifact.keys() and \
                os.path.exists(artifact['local']):
            local_path = artifact['local']
        else:
            if artifact['mutable']:
                local_path = fs_tracker.get_artifact_cache(key)
            else:
                local_path = fs_tracker.get_blob_cache(key)
                if os.path.exists(local_path):
                    self.logger.info(
                        ('Immutable artifact exists at local_path {},' +
                         ' skipping the download').format(local_path))
                    return local_path

    local_path = re.sub('\/\Z', '', local_path)
    local_basepath = os.path.dirname(local_path)

    self.logger.info(
        "Downloading dir {} to local path {} from storage...".format(
            key, local_path))

    if only_newer and os.path.exists(local_path):
        self.logger.debug(
            'Comparing date of the artifact in storage with local')
        storage_time = self._get_file_timestamp(key)
        local_time = os.path.getmtime(local_path)
        if storage_time is None:
            self.logger.info(
                "Unable to get storage timestamp, storage is either " +
                "corrupted or has not finished uploading")
            return local_path

        if local_time > storage_time - self.timestamp_shift:
            self.logger.info(
                "Local path is younger than stored, skipping the download")
            return local_path

    tar_filename = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
    self.logger.debug("tar_filename = {} ".format(tar_filename))

    def finish_download():
        try:
            self._download_file(key, tar_filename)
        except BaseException as e:
            self.logger.debug(e)

        if os.path.exists(tar_filename):
            # first, figure out if the tar file has a base path of .
            # or not
            self.logger.info("Untarring {}".format(tar_filename))
            listtar, _ = subprocess.Popen(
                ['tar', '-tf', tar_filename],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                close_fds=True).communicate()
            listtar = listtar.strip().split(b'\n')
            listtar = [s.decode('utf-8') for s in listtar]
            self.logger.info('List of files in the tar: ' + str(listtar))
            if listtar[0].startswith('./'):
                # Files are archived into tar from .; adjust path
                # accordingly
                basepath = local_path
            else:
                basepath = local_basepath

            tarcmd = ('mkdir -p {} && ' +
                      'tar -xf {} -C {} --keep-newer-files') \
                .format(basepath, tar_filename, basepath)
            self.logger.debug('Tar cmd = {}'.format(tarcmd))

            tarp = subprocess.Popen(['/bin/bash', '-c', tarcmd],
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.STDOUT,
                                    close_fds=True)

            tarout, tarerr = tarp.communicate()
            if tarp.returncode != 0:
                self.logger.info('tar had a non-zero return code!')
                self.logger.info('tar cmd = ' + tarcmd)
                self.logger.info('tar output: \n ' + str(tarout))

            if len(listtar) == 1:
                actual_path = os.path.join(basepath, listtar[0])
                self.logger.info('Renaming {} into {}'.format(
                    actual_path, local_path))
                retry(lambda: os.rename(actual_path, local_path),
                      no_retries=5,
                      sleep_time=1,
                      exception_class=OSError,
                      logger=self.logger)

            os.remove(tar_filename)
        else:
            self.logger.warning(
                'file {} download failed'.format(tar_filename))

    if background:
        t = Thread(target=finish_download)
        t.start()
        return (local_path, t)
    else:
        finish_download()
        return local_path
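# For external (url / qualified) artifacts, the cache key above is simply
# the SHA-256 digest of the remote path, so the same immutable URL always
# resolves to the same blob-cache directory. A quick standalone check:

import hashlib

remote_path = 'http://example.com/dataset.tgz'  # hypothetical URL
key = hashlib.sha256(remote_path.encode()).hexdigest()
# `key` is a stable 64-character hex string; repeated calls with the same
# remote_path always produce the same key.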
def __init__(self, key, filename, args, pythonenv,
             project=None,
             artifacts=None,
             status='waiting',
             resources_needed=None,
             time_added=None,
             time_started=None,
             time_last_checkpoint=None,
             time_finished=None,
             info={},
             git=None,
             metric=None,
             pythonver=None,
             max_duration=None):

    self.key = key
    self.args = []
    self.filename = filename
    if filename and '::' in filename:
        self.filename = '-m'
        module_name = filename.replace('::', '.')
        if module_name.startswith('.'):
            module_name = module_name[1:]
        self.args.append(module_name)

    if args:
        self.args += args
    self.args = [shquote(a) for a in self.args]

    self.pythonenv = pythonenv
    self.project = project
    self.pythonver = pythonver if pythonver else sys.version_info[0]

    workspace_path = os.path.abspath('.')
    try:
        model_dir = fs_tracker.get_model_directory(key)
    except BaseException:
        model_dir = None

    self.artifacts = {
        'workspace': {
            'local': workspace_path,
            'mutable': False,
            'unpack': True
        },
        'modeldir': {
            'local': model_dir,
            'mutable': True,
            'unpack': True
        },
        'output': {
            'local': fs_tracker.get_artifact_cache('output', key),
            'mutable': True,
            'unpack': True
        },
        'tb': {
            'local': fs_tracker.get_tensorboard_dir(key),
            'mutable': True,
            'unpack': True
        },
        '_metrics': {
            'local': fs_tracker.get_artifact_cache('_metrics', key),
            'mutable': True,
            'unpack': True
        }
    }
    if artifacts is not None:
        self.artifacts.update(artifacts)

    self.resources_needed = resources_needed
    self.status = status
    self.time_added = time_added
    self.time_started = time_started
    self.time_last_checkpoint = time_last_checkpoint
    self.time_finished = time_finished
    self.info = info
    self.git = git
    self.metric = metric
    self.max_duration = max_duration
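# Illustration of the '::' handling above: a filename such as
# 'trainers::mnist' is rewritten so the experiment runs as
# `python -m trainers.mnist ...`. A minimal standalone sketch of the same
# transformation (the names here are hypothetical examples):

filename = 'trainers::mnist'
module_name = filename.replace('::', '.')
if module_name.startswith('.'):
    module_name = module_name[1:]
cmd_args = ['-m', module_name] + ['--epochs', '5']
# cmd_args == ['-m', 'trainers.mnist', '--epochs', '5']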
def run(self, experiment):
    if isinstance(experiment, six.string_types):
        experiment = self.db.get_experiment(experiment)
    elif not isinstance(experiment, Experiment):
        raise ValueError("Unknown type of experiment: " +
                         str(type(experiment)))

    self.logger.info("Experiment key: " + experiment.key)

    with model.get_db_provider(self.config) as db:
        db.start_experiment(experiment)

        """ Override env variables with those inside the queued message
        """
        env = dict(os.environ)
        if 'env' in self.config.keys():
            for k, v in six.iteritems(self.config['env']):
                if v is not None:
                    env[str(k)] = str(v)

        env['PYTHONUNBUFFERED'] = 'TRUE'

        fs_tracker.setup_experiment(env, experiment, clean=False)
        log_path = fs_tracker.get_artifact_cache('output', experiment.key)

        # log_path = os.path.join(model_dir, self.config['log']['name'])

        self.logger.debug('Child process environment:')
        self.logger.debug(str(env))

        sched = BackgroundScheduler()
        sched.start()

        with open(log_path, 'w') as output_file:
            python = 'python'
            if experiment.pythonver == 3:
                python = 'python3'

            cmd = [python, experiment.filename] + experiment.args
            cwd = experiment.artifacts['workspace']['local']

            container_artifact = experiment.artifacts.get('_singularity')
            if container_artifact:
                container = container_artifact.get('local')
                if not container:
                    container = container_artifact.get('qualified')

                cwd = fs_tracker.get_artifact_cache(
                    'workspace', experiment.key)

                for tag, art in six.iteritems(experiment.artifacts):
                    local_path = art.get('local')
                    if not art['mutable'] and os.path.exists(local_path):
                        os.symlink(art['local'],
                                   os.path.join(os.path.dirname(cwd), tag))

                if experiment.filename is not None:
                    cmd = [
                        'singularity',
                        'exec',
                        container,
                    ] + cmd
                else:
                    cmd = ['singularity', 'run', container]

            self.logger.info('Running cmd: \n {} '.format(cmd))

            p = subprocess.Popen(cmd,
                                 stdout=output_file,
                                 stderr=subprocess.STDOUT,
                                 env=env,
                                 cwd=cwd)
            # simple hack to show what's in the log file
            # ptail = subprocess.Popen(["tail", "-f", log_path])

            logtail = Pygtail(log_path)

            def tail_func():
                while logtail:
                    for line in logtail:
                        print(line)
                    time.sleep(0.1)

            tail_thread = threading.Thread(target=tail_func)
            tail_thread.start()

            minutes = 0
            if self.config.get('saveWorkspaceFrequency'):
                minutes = int(
                    str2duration(self.config['saveWorkspaceFrequency'])
                    .total_seconds() / 60)

            def checkpoint():
                try:
                    db.checkpoint_experiment(experiment)
                except BaseException as e:
                    self.logger.info(e)

            sched.add_job(checkpoint, 'interval', minutes=minutes)

            metrics_path = fs_tracker.get_artifact_cache(
                '_metrics', experiment.key)

            minutes = 0
            if self.config.get('saveMetricsFrequency'):
                minutes = int(
                    str2duration(self.config['saveMetricsFrequency'])
                    .total_seconds() / 60)

            sched.add_job(lambda: save_metrics(metrics_path),
                          'interval', minutes=minutes)

            def kill_if_stopped():
                if db.get_experiment(
                        experiment.key,
                        getinfo=False).status == 'stopped':
                    p.kill()

                if experiment.max_duration is not None and \
                        time.time() > experiment.time_started + \
                        int(str2duration(experiment.max_duration)
                            .total_seconds()):
                    p.kill()

            sched.add_job(kill_if_stopped, 'interval', seconds=10)

            try:
                p.wait()
            finally:
                save_metrics(metrics_path)
                sched.shutdown()
                logtail = None
                db.checkpoint_experiment(experiment)
                db.finish_experiment(experiment)
                return p.returncode
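# str2duration above parses config values such as saveWorkspaceFrequency
# and experiment.max_duration; its implementation is not shown in this
# section. A minimal sketch of such a parser, under the assumption that it
# returns a datetime.timedelta and supports only simple forms like
# '30s', '15m', '2h', '1d':

from datetime import timedelta


def str2duration(s):
    units = {'s': 'seconds', 'm': 'minutes', 'h': 'hours', 'd': 'days'}
    return timedelta(**{units[s[-1]]: int(s[:-1])})


# e.g. int(str2duration('30m').total_seconds() / 60) == 30, which is the
# checkpoint interval (in minutes) handed to the scheduler above.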