def init(job_type=None, dir=None, config=None, project=None, entity=None,
         reinit=None, tags=None, group=None, allow_val_change=False,
         resume=False, force=False, tensorboard=False,
         sync_tensorboard=False, name=None, notes=None, id=None, magic=None):
    """Initialize W&B

    If called from within Jupyter, initializes a new run and waits for a call
    to `wandb.log` to begin pushing metrics.  Otherwise, spawns a new process
    to communicate with W&B.

    Most arguments are exported as environment variables (see the `env`
    module) before the run object is built from the environment.

    Args:
        job_type (str, optional): The type of job running.  Falls back to the
            `env.JOB_TYPE` environment variable, then to the TF_CONFIG task
            type when running under a TF job.
        dir (str, optional): An absolute path to a directory where metadata
            will be stored
        config (dict, argparse, or tf.FLAGS, optional): The hyper parameters
            to store with the run
        project (str, optional): The project to push metrics to
        entity (str, optional): The entity to push metrics to
        reinit (bool, optional): Allow multiple calls to init in the same
            process.  In Jupyter, re-initialization happens unless this is
            explicitly False.
        tags (list, optional): A list of tags to apply to the run
        group (str, optional): A unique string shared by all runs in a given
            group.  Falls back to the `env.RUN_GROUP` environment variable.
        allow_val_change (bool, optional): Forwarded to `run.config.update`
            when applying `config`; forced to True when a SageMaker config
            was detected.
        resume (bool, str, optional): Automatically resume this run if run
            from the same machine, you can also pass a unique run_id
        force (bool, optional): Force authentication with wandb, defaults to
            False
        tensorboard (bool, optional): Treated the same as `sync_tensorboard`;
            slated for deprecation.
        sync_tensorboard (bool, optional): Synchronize wandb logs to
            tensorboard or tensorboardX
        name (str, optional): A display name which does not have to be unique
        notes (str, optional): A multiline string associated with the run
        id (str, optional): A globally unique (per project) identifier for
            the run
        magic (bool, dict, or str, optional): magic configuration as bool,
            dict, json string, yaml filename

    Returns:
        A wandb.run object for metric and config logging.  If a run already
        exists in this process, or `env.INITED` is already set (e.g. in a
        child process), the current module-level `run` is returned as-is,
        which may be None.

    Raises:
        LaunchError: If the run directory cannot be created, or if the run
            mode is not one of 'clirun', 'run' or 'dryrun'.
    """
    # Must stay the first statement: locals() is only the call arguments here.
    trigger.call('on_init', **locals())
    global run
    global __stage_dir__
    global summary

    # We allow re-initialization when we're in Jupyter or explicitly opt in to
    # it.  `!=` (not `is not`) is kept on purpose so that reinit=0 still
    # disables re-initialization in Jupyter.
    in_jupyter = _get_python_type() != "python"
    if reinit or (in_jupyter and reinit != False):
        reset_env(exclude=env.immutable_keys())
        run = None

    # TODO: deprecate tensorboard
    # Parenthesised so the "not patched yet" guard applies to both flags;
    # without the parentheses `and` binds tighter than `or` and the guard was
    # skipped whenever `tensorboard` was truthy.
    if (tensorboard or sync_tensorboard) and len(patched["tensorboard"]) == 0:
        util.get_module("wandb.tensorboard").patch()

    sagemaker_config = util.parse_sm_config()
    tf_config = util.parse_tfjob_config()
    if group is None:
        group = os.getenv(env.RUN_GROUP)
    if job_type is None:
        job_type = os.getenv(env.JOB_TYPE)

    if sagemaker_config:
        # Set run_id and potentially grouping if we're in SageMaker
        run_id = os.getenv('TRAINING_JOB_NAME')
        if run_id:
            os.environ[env.RUN_ID] = '-'.join([
                run_id,
                os.getenv('CURRENT_HOST', socket.gethostname())])
        with open("/opt/ml/input/config/resourceconfig.json") as conf_file:
            conf = json.load(conf_file)
        if group is None and len(conf["hosts"]) > 1:
            group = os.getenv('TRAINING_JOB_NAME')
        # Set secret variables
        if os.path.exists("secrets.env"):
            with open("secrets.env", "r") as secrets_file:
                for line in secrets_file:
                    line = line.strip()
                    if not line:
                        # A blank line has no KEY=VALUE pair to export.
                        continue
                    key, val = line.split('=', 1)
                    os.environ[key] = val
    elif tf_config:
        cluster = tf_config.get('cluster')
        job_name = tf_config.get('task', {}).get('type')
        task_index = tf_config.get('task', {}).get('index')
        if job_name is not None and task_index is not None:
            # TODO: set run_id for resuming?
            run_id = cluster[job_name][task_index].rsplit(":")[0]
            if job_type is None:
                job_type = job_name
            if group is None and len(cluster.get("worker", [])) > 0:
                group = cluster[job_name][0].rsplit("-" + job_name, 1)[0]

    image = util.image_id_from_k8s()
    if image:
        os.environ[env.DOCKER] = image
    if project:
        os.environ[env.PROJECT] = project
    if entity:
        os.environ[env.ENTITY] = entity
    if group:
        os.environ[env.RUN_GROUP] = group
    if job_type:
        os.environ[env.JOB_TYPE] = job_type
    if tags:
        os.environ[env.TAGS] = ",".join(tags)
    if id:
        os.environ[env.RUN_ID] = id
        if name is None:
            # We do this because of https://github.com/wandb/core/issues/2170
            # to ensure that the run's name is explicitly set to match its
            # id. If we don't do this and the id is eight characters long, the
            # backend will set the name to a generated human-friendly value.
            #
            # In any case, if the user is explicitly setting `id` but not
            # `name`, their id is probably a meaningful string that we can
            # use to label the run.
            # environment variable takes precedence over this.
            name = os.environ.get(env.NAME, id)
    if name:
        os.environ[env.NAME] = name
    if notes:
        os.environ[env.NOTES] = notes

    if magic is not None and magic is not False:
        if isinstance(magic, dict):
            os.environ[env.MAGIC] = json.dumps(magic)
        elif isinstance(magic, str):
            os.environ[env.MAGIC] = magic
        elif isinstance(magic, bool):
            pass
        else:
            termwarn("wandb.init called with invalid magic parameter type", repeat=False)
        # Imported lazily so magic_impl is only loaded when magic is requested.
        from wandb import magic_impl
        magic_impl.magic_install()

    if dir:
        os.environ[env.DIR] = dir
        util.mkdir_exists_ok(wandb_dir())

    resume_path = os.path.join(wandb_dir(), wandb_run.RESUME_FNAME)
    # `== True` (not `is True`) is kept so values equal to True, such as 1,
    # keep selecting "auto" rather than being used as a run id below.
    if resume == True:
        os.environ[env.RESUME] = "auto"
    elif resume:
        os.environ[env.RESUME] = os.environ.get(env.RESUME, "allow")
        # TODO: remove allowing resume as a string in the future
        os.environ[env.RUN_ID] = id or resume
    elif os.path.exists(resume_path):
        os.remove(resume_path)
    if os.environ.get(env.RESUME) == 'auto' and os.path.exists(resume_path):
        if not os.environ.get(env.RUN_ID):
            with open(resume_path) as resume_file:
                os.environ[env.RUN_ID] = json.load(resume_file)["run_id"]

    # the following line is useful to ensure that no W&B logging happens in the user
    # process that might interfere with what they do
    # logging.basicConfig(format='user process %(asctime)s - %(name)s - %(levelname)s - %(message)s')

    # If a thread calls wandb.init() it will get the same Run object as
    # the parent. If a child process with distinct memory space calls
    # wandb.init(), it won't get an error, but it will get a result of
    # None.
    # This check ensures that a child process can safely call wandb.init()
    # after a parent has (only the parent will create the Run object).
    # This doesn't protect against the case where the parent doesn't call
    # wandb.init but two children do.
    if run or os.getenv(env.INITED):
        return run

    if __stage_dir__ is None:
        __stage_dir__ = "wandb"
        util.mkdir_exists_ok(wandb_dir())

    try:
        signal.signal(signal.SIGQUIT, _debugger)
    except AttributeError:
        # The signal module has no SIGQUIT attribute on this platform.
        pass

    try:
        run = wandb_run.Run.from_environment_or_defaults()
    except IOError as e:
        termerror('Failed to create run directory: {}'.format(e))
        raise LaunchError("Could not write to filesystem.")

    run.set_environment()

    def set_global_config(run):
        global config  # because we already have a local config
        config = run.config
    set_global_config(run)
    summary = run.summary

    # set this immediately after setting the run and the config. if there is an
    # exception after this it'll probably break the user script anyway
    os.environ[env.INITED] = '1'

    # we do these checks after setting the run and the config because users scripts
    # may depend on those things
    if sys.platform == 'win32' and run.mode != 'clirun':
        termerror(
            'To use wandb on Windows, you need to run the command "wandb run python <your_train_script>.py"')
        return run

    if in_jupyter:
        _init_jupyter(run)
    elif run.mode == 'clirun':
        pass
    elif run.mode == 'run':
        api = InternalApi()
        # let init_jupyter handle this itself
        if not in_jupyter and not api.api_key:
            termlog(
                "W&B is a tool that helps track and visualize machine learning experiments")
            if force:
                # NOTE(review): no headless process is started on this path;
                # only the error is reported.
                termerror(
                    "No credentials found. Run \"wandb login\" or \"wandb off\" to disable wandb")
            else:
                if run.check_anonymous():
                    _init_headless(run)
                else:
                    termlog(
                        "No credentials found. Run \"wandb login\" to visualize your metrics")
                    run.mode = "dryrun"
                    _init_headless(run, False)
        else:
            _init_headless(run)
    elif run.mode == 'dryrun':
        termlog(
            'Dry run mode, not syncing to the cloud.')
        _init_headless(run, False)
    else:
        termerror(
            'Invalid run mode "%s". Please unset WANDB_MODE.' % run.mode)
        raise LaunchError("The WANDB_MODE environment variable is invalid.")

    # set the run directory in the config so it actually gets persisted
    run.config.set_run_dir(run.dir)

    if sagemaker_config:
        run.config.update(sagemaker_config)
        allow_val_change = True
    if config:
        run.config.update(config, allow_val_change=allow_val_change)

    # Access history to ensure resumed is set when resuming
    run.history
    # Load the summary to support resuming
    run.summary.load()

    atexit.register(run.close_files)

    return run
# NOTE(review): these two statements sit outside any definition, so as written
# they would run whenever this code is executed at module level.  The same two
# statements also appear inside init(), where they run only when a `magic`
# argument is supplied -- confirm whether this trailing copy is intentional.
from wandb import magic_impl
magic_impl.magic_install()