def run_main(asset_path):
    loader = runsenabler_loader.RunsEnablerLoader("some_dir")
    plugins = default.get_plugins() + [
        paramplot_plugin.ParamPlotPlugin,
        loader
    ]
    gr_tensorboard = TensorBoard(plugins, lambda: open(asset_path, 'rb'))
    gr_tensorboard.configure(sys.argv)
    use_filesystem_controller = gr_tensorboard.flags.use_filesystem_controller
    original_logdir = pathlib.Path(gr_tensorboard.flags.logdir)
    loader.actual_logdir = str(original_logdir)

    if use_filesystem_controller:
        # Retrieve the actual log directory and replace it in the context
        # with the new logdir
        parent_dir = original_logdir.parent
        print("logdir provided: " + str(original_logdir))
        new_logdir = parent_dir / "temp_dir"
        print("creating temporary workspace in " + str(new_logdir))

        # Create the temp dir
        new_logdir.mkdir(parents=True)

        # Swap the original logdir for the new one
        gr_tensorboard.flags.logdir = str(new_logdir)

    try:
        sys.exit(gr_tensorboard.main())
    finally:
        if use_filesystem_controller:
            shutil.rmtree(str(new_logdir))
def run_main(asset_path):
    plugins = default.get_plugins() + [
        paramplot_plugin.ParamPlotPlugin,
        runsenabler_loader.RunsEnablerLoader()
    ]
    gr_tensorboard = TensorBoard(plugins, lambda: open(asset_path, 'rb'))
    gr_tensorboard.configure(sys.argv)
    sys.exit(gr_tensorboard.main())
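# A minimal usage sketch for the run_main variants above, assuming they are
# exposed through a console entry point. The frontend asset path passed here
# ('webfiles.zip' next to this file) is a hypothetical example, not taken
# from the snippets above.
import os

if __name__ == '__main__':
    run_main(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                          'webfiles.zip'))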
def launch_tensorboard(log_path: str) -> str:
    """Launch tensorboard at given log path.

    :param log_path: log path
    :return: tensorboard url
    """
    tb = TensorBoard()
    tb.configure((None, "--logdir", log_path))
    url = tb.launch()
    return url
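# Usage sketch for launch_tensorboard() above. The log directory is a
# hypothetical example path; TensorBoard is assumed to be the programmatic
# tensorboard.program.TensorBoard API (or a compatible wrapper).
if __name__ == "__main__":
    tb_url = launch_tensorboard("./runs/experiment_1")
    print(f"TensorBoard serving at {tb_url}")
    input("Press Enter to stop...")  # Keep the process alive while browsing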
def initialize(log_root_path: Optional[str] = None,
               log_name: Optional[str] = None,
               verbose: bool = True) -> SummaryWriter:
    """Initialize Tensorboard daemon.

    .. note::
        It will be used later for monitoring the learning progress.

    :param log_root_path: Fullpath of root log directory.
                          Optional: location of this file / log by default.
    :param log_name: Name of the subdirectory where to save data.
                     Optional: full date _ hostname by default.
    :param verbose: Whether or not to print information about what is going
                    on. Optional: True by default.

    :returns: SummaryWriter to pass to the training agent to monitor the
              training progress.
    """
    # Configure Tensorboard
    if log_root_path is None:
        log_root_path = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "log")

    # Launch the Tensorboard daemon only once per process, guarded by a
    # module-level global so that repeated calls reuse the same instance
    global tb
    if 'tb' not in globals():
        tb = TensorBoard()
        tb.configure(host="0.0.0.0", logdir=log_root_path)
        url = tb.launch()
        if verbose:
            print(f"Started Tensorboard {url}. "
                  f"Root directory: {log_root_path}")

    # Create log directory
    if log_name is None:
        log_name = "_".join((
            datetime.now().strftime("%Y_%m_%d_%H_%M_%S"),
            socket.gethostname().replace('-', '_')))
    log_path = os.path.join(log_root_path, log_name)
    pathlib.Path(log_path).mkdir(parents=True, exist_ok=True)
    if verbose:
        print(f"Tensorboard logfiles directory: {log_path}")

    return SummaryWriter(log_path)
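# Usage sketch for initialize() above: grab the returned SummaryWriter and
# log a few scalars with it. The tag and values are made up for illustration;
# add_scalar/flush are the standard torch.utils.tensorboard.SummaryWriter API.
writer = initialize(verbose=True)
for step in range(3):
    writer.add_scalar("debug/dummy_metric", float(step), global_step=step)
writer.flush()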
ray.init(
    address=None,          # The address of the Ray cluster to connect to, if any
    num_cpus=8,            # Number of CPUs assigned to each raylet (None = no limit)
    num_gpus=1,            # Number of GPUs assigned to each raylet (None = no limit)
    webui_host="0.0.0.0",  # The host to bind the web UI server to
    local_mode=False,      # If true, the code will be executed serially (for debugging purposes)
    logging_level=20       # Logging level (20 corresponds to logging.INFO)
)

# Create tensorboard Jupyter cell:
# %load_ext tensorboard
# %tensorboard --logdir logs
if 'tb' not in locals():
    tb = TensorBoard()
    tb.configure(host="0.0.0.0",
                 logdir=os.path.join(pathlib.Path.home(), 'ray_results'))
    url = tb.launch()
    print(f"Starting Tensorboard {url} ...")

# ================= Run hyperparameter search =================

# Register the custom model architecture (it implements 'vf_share_layers')
ModelCatalog.register_custom_model("my_model", FullyConnectedNetwork)

# Register the environment with custom default constructor arguments
env_creator = lambda env_config: gym.make(GYM_ENV_NAME, **GYM_ENV_KWARGS)
register_env("my_custom_env", env_creator)
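# Hedged sketch of the hyperparameter search announced by the heading above:
# a tune.run call wiring in the registered "my_custom_env" and "my_model".
# The algorithm ("PPO"), stopping criterion and worker count are assumptions
# for illustration only (Ray 0.8/1.x style API).
from ray import tune

analysis = tune.run(
    "PPO",
    config={
        "env": "my_custom_env",
        "model": {"custom_model": "my_model"},
        "num_workers": 1,
    },
    stop={"training_iteration": 10})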
            best_reward = result['rew']
            best_epoch = epoch
            if save_fn:
                save_fn(policy)
        if verbose:
            print(f'Epoch #{epoch}: test_reward: {result["rew"]:.6f}, '
                  f'best_reward: {best_reward:.6f} in #{best_epoch}')
        if stop_fn and stop_fn(best_reward):
            break
    return gather_info(
        start_time, train_collector, test_collector, best_reward)


### Configure Tensorboard
data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'log')
if 'tb' not in locals():
    tb = TensorBoard()
    tb.configure(host="0.0.0.0", logdir=data_path)
    url = tb.launch()
    print(f"Started Tensorboard {url} at {data_path}...")
writer = SummaryWriter(data_path)

### Configure export
def save_fn(policy):
    torch.save(policy.state_dict(), os.path.join(data_path, 'policy.pth'))

### Configure early stopping of training
def stop_fn(x):
    return x >= TARGET_EPISODE_STEPS

### Run the learning process
result = onpolicy_trainer(
def initialize(num_cpus: int,
               num_gpus: int,
               log_root_path: str,
               log_name: Optional[str] = None,
               logger_cls: type = TBXLogger,
               launch_tensorboard: bool = True,
               debug: bool = False,
               verbose: bool = True) -> Callable[[Dict[str, Any]], Logger]:
    """Initialize Ray and Tensorboard daemons.

    It will be used later for almost everything, from the dashboard and
    remote/client management to multithreaded environments.

    .. note::
        The default Tensorboard port will be used, namely 6006, if available,
        using 0.0.0.0 (binding to all IPv4 addresses on the local machine).
        Similarly, the Ray dashboard port is 8265 if available. In both
        cases, the port will be increased iteratively until an available one
        is found.

    :param num_cpus: Maximum number of CPU threads that can be executed in
                     parallel. Note that it does not actually reserve part of
                     the CPU, so that several processes can reserve the number
                     of threads available on the system at the same time.
    :param num_gpus: Maximum number of GPU units that can be used, which can
                     be fractional to only allocate part of the resource.
                     Note that contrary to CPU resources, the memory is likely
                     to actually be reserved and allocated by the process, in
                     particular when using the Tensorflow backend.
    :param log_root_path: Fullpath of root log directory.
    :param log_name: Name of the subdirectory where to save data. `None` to
                     use default name, empty string '' to set it interactively
                     in command prompt. It must be a valid Python identifier.
                     Optional: full date _ hostname by default.
    :param logger_cls: Custom logger class type deriving from `TBXLogger`.
                       Optional: `TBXLogger` by default.
    :param launch_tensorboard: Whether or not to launch tensorboard
                               automatically.
                               Optional: Enabled by default.
    :param debug: Whether or not to display debugging trace.
                  Optional: Disabled by default.
    :param verbose: Whether or not to print information about what is going
                    on.
                    Optional: True by default.

    :returns: lambda function to pass to a `ray.Trainer` to monitor learning
              progress in Tensorboard.
    """
    # Make sure provided logger class derives from ray.tune.logger.Logger
    assert issubclass(logger_cls, Logger), (
        "Logger class must derive from `ray.tune.logger.Logger`")

    # Check if cluster servers are already running, and if requested
    # resources are available.
    is_cluster_running = False
    redis_addresses = services.find_redis_address()
    if redis_addresses:
        for redis_address in redis_addresses:
            # Connect to redis global state accessor
            global_state_accessor = GlobalStateAccessor(
                redis_address, ray_constants.REDIS_DEFAULT_PASSWORD)
            global_state_accessor.connect()

            # Get available resources
            resources: Dict[str, int] = defaultdict(int)
            for info in global_state_accessor.get_all_available_resources():  # pylint: disable=no-member
                message = ray.gcs_utils.AvailableResources.FromString(info)
                for field, capacity in message.resources_available.items():
                    resources[field] += capacity

            # Disconnect global state accessor
            time.sleep(0.1)
            global_state_accessor.disconnect()

            # Check if enough computation resources are available
            is_cluster_running = (resources["CPU"] >= num_cpus and
                                  resources["GPU"] >= num_gpus)

            # Stop looking as soon as a cluster with enough resources is found
            if is_cluster_running:
                break

    # Connect to Ray server if necessary, starting one if not already running
    if not ray.is_initialized():
        if not is_cluster_running:
            # Start new Ray server, if not already running
            ray.init(
                # Address of Ray cluster to connect to, if any
                address=None,
                # Number of CPUs assigned to each raylet
                num_cpus=num_cpus,
                # Number of GPUs assigned to each raylet
                num_gpus=num_gpus,
                # Enable object eviction in LRU order under memory pressure
                _lru_evict=False,
                # Whether or not to execute the code serially (for debugging)
                local_mode=debug,
                # Logging level
                logging_level=logging.DEBUG if debug else logging.ERROR,
                # Whether to redirect outputs from every worker to the driver
                log_to_driver=debug,
                # Whether to start Ray dashboard, to monitor cluster's status
                include_dashboard=True,
                # The host to bind the dashboard server to
                dashboard_host="0.0.0.0")
        else:
            # Connect to existing Ray cluster
            ray.init(
                address="auto",
                _lru_evict=False,
                local_mode=debug,
                logging_level=logging.DEBUG if debug else logging.ERROR,
                log_to_driver=debug,
                include_dashboard=False)

    # Configure Tensorboard
    if launch_tensorboard:
        tb = TensorBoard()
        tb.configure(host="0.0.0.0", logdir=os.path.abspath(log_root_path))
        url = tb.launch()
        if verbose:
            print(f"Started Tensorboard {url}.",
                  f"Root directory: {log_root_path}")

    # Define log filename interactively if requested
    if log_name == "":
        while True:
            log_name = input(
                "Enter desired log subdirectory name (empty for default)...")
            if not log_name or re.match(r'^[A-Za-z0-9_]+$', log_name):
                break
            print("Invalid name. Only Python identifiers are supported.")

    # Handling of default log name and sanity checks
    if not log_name:
        log_name = "_".join((
            datetime.now().strftime("%Y_%m_%d_%H_%M_%S"),
            re.sub(r'[^A-Za-z0-9_]', "_", socket.gethostname())))
    else:
        assert re.match(r'^[A-Za-z0-9_]+$', log_name), (
            "Log name must be a valid Python identifier.")

    # Create log directory
    log_path = os.path.join(log_root_path, log_name)
    pathlib.Path(log_path).mkdir(parents=True, exist_ok=True)
    if verbose:
        print(f"Tensorboard logfiles directory: {log_path}")

    # Define Ray logger
    def logger_creator(config: Dict[str, Any]) -> Logger:
        return logger_cls(config, log_path)

    return logger_creator
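# Usage sketch for initialize() above: the returned logger_creator is meant
# to be passed to an RLlib trainer so that its Tensorboard logs end up under
# log_path. The PPOTrainer import, environment name and config below are
# assumptions for illustration (Ray 1.x style API), not part of the original
# function.
from ray.rllib.agents.ppo import PPOTrainer

logger_creator = initialize(num_cpus=8, num_gpus=1,
                            log_root_path="./log", log_name="demo")
trainer = PPOTrainer(env="CartPole-v0",
                     config={"num_workers": 1},
                     logger_creator=logger_creator)
for _ in range(2):
    print(trainer.train()["episode_reward_mean"])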