def __init__(self, env_name: str, worker_id: int, global_model: ActorCriticNetwork, seed: int, T: Value,
             lr: float = 1e-4, n_steps: int = 0, t_max: int = 100000, gamma: float = .99, tau: float = 1,
             beta: float = .01, value_loss_coef: float = .5, optimizer: Optimizer = None,
             is_train: bool = True, use_gae: bool = True, is_discrete: bool = False) -> None:
    """
    Initialize a Worker thread for the A3C algorithm.

    :param env_name: gym environment name
    :param worker_id: id of this worker
    :param global_model: shared global model to get the parameters from
    :param seed: seed to ensure reproducibility
    :param T: global shared counter
    :param lr: learning rate for the worker's network
    :param n_steps: number of forward steps per update
    :param t_max: maximum number of episodes for training
    :param gamma: discount factor
    :param tau: weighting factor for GAE (the lambda parameter)
    :param beta: entropy weight factor
    :param value_loss_coef: factor for scaling the value loss
    :param optimizer: torch optimizer instance, either a shared optimizer or None for an individual one
    :param is_train: whether this worker trains the model or only evaluates it
    :param use_gae: use Generalized Advantage Estimation
    :param is_discrete: whether the action space is discrete
    """
    super(Worker, self).__init__()

    self.is_discrete = is_discrete

    # separate env for each worker
    self.env_name = env_name

    # check if the requested environment is a Quanser robot env
    if self.env_name in ['CartpoleStabShort-v0']:
        self.env = quanser_robots.GentlyTerminating(gym.make(self.env_name))
    else:
        # use the official gym env as default
        self.env = gym.make(self.env_name)

    # training params
    self.n_steps = n_steps
    self.tau = tau
    self.gamma = gamma
    self.beta = beta
    self.value_loss_coef = value_loss_coef
    self.use_gae = use_gae

    # training and testing params
    self.seed = seed
    self.lr = lr
    self.t_max = t_max
    self.is_train = is_train

    # shared params
    self.optimizer = optimizer
    self.global_model = global_model
    self.worker_id = worker_id
    self.T = T

    # logging instance
    self.logger = logging.getLogger(__name__)
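# Illustrative sketch (not part of the original class): how `gamma` and `tau` are typically
# combined for Generalized Advantage Estimation when `use_gae` is enabled. The helper name
# `compute_gae` and its inputs are assumptions for illustration only; the Worker's actual
# training step is not shown in this section. The sketch assumes the rollout does not cross
# an episode boundary (no done-masking).
def compute_gae(rewards, values, last_value, gamma=.99, tau=1.0):
    """Turn an n-step rollout of rewards and value estimates into GAE advantages."""
    values = list(values) + [last_value]
    advantages = []
    gae = 0.0
    for step in reversed(range(len(rewards))):
        # TD residual: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
        delta = rewards[step] + gamma * values[step + 1] - values[step]
        # exponentially weighted sum of residuals, discounted by gamma * tau
        gae = delta + gamma * tau * gae
        advantages.insert(0, gae)
    return advantages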
def run(self):
    torch.manual_seed(self.seed)

    env = quanser_robots.GentlyTerminating(gym.make(self.env_name))
    # env = gym.make(self.env_name)

    global_model = ActorCriticNetwork(env.observation_space.shape[0], env.action_space, self.is_discrete)
    global_model.share_memory()

    # TODO
    optimizer = SharedRMSProp(global_model.parameters(), lr=self.lr)
    optimizer.share_memory()

    # start the test worker, which renders the env so the current training progress can be observed
    w = Worker(env_name=self.env_name, worker_id=self.n_worker, global_model=global_model, T=self.T,
               seed=self.seed, lr=self.lr, n_steps=0, t_max=200, gamma=.99, tau=1, beta=.01,
               value_loss_coef=.5, optimizer=None, is_train=False, is_discrete=self.is_discrete)
    w.start()
    self.worker_pool.append(w)

    # start all training workers, which update the model parameters
    for wid in range(0, self.n_worker):
        self.logger.info("Worker {} created".format(wid))
        w = Worker(env_name=self.env_name, worker_id=wid, global_model=global_model, T=self.T,
                   seed=self.seed, lr=self.lr, n_steps=20, t_max=1000, gamma=.99, tau=1, beta=.01,
                   value_loss_coef=.5, optimizer=None, is_train=True, is_discrete=self.is_discrete)
        w.start()
        self.worker_pool.append(w)

    for w in self.worker_pool:
        w.join()
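# Illustrative sketch: the shared counter `T` handed to every Worker is expected to be a
# multiprocessing Value so that all processes increment one global step count, and the later
# variants additionally share a running `global_reward`. The construction below is an assumption
# for illustration; the surrounding trainer class is not shown in this section.
import multiprocessing as mp

T = mp.Value('i', 0)                 # global shared step counter (integer)
global_reward = mp.Value('d', 0.0)   # global running reward (double)

# inside a worker, updates are guarded by the Value's lock:
# with T.get_lock():
#     T.value += 1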
def get_env(env_name, monitor=False):
    if 'RR' in env_name:
        env = quanser_robots.GentlyTerminating(gym.make(env_name))
    else:
        if monitor:
            env = Monitor(gym.make(env_name), 'experiments/100_test_runs',
                          video_callable=lambda count: count % 100 == 0, force=True)
        else:
            # use the official gym env as default
            env = gym.make(env_name)

    return env
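# Example usage of get_env (the env names below are placeholders for illustration):
#   env = get_env('CartpoleStabShort-v0')           # plain gym env
#   env = get_env('CartpoleStabRR-v0')              # Quanser RR env, wrapped in GentlyTerminating
#   env = get_env('Pendulum-v0', monitor=True)      # gym env, every 100th episode recorded by Monitor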
def __init__(self, env_name, seed, n_features):
    # general
    self.env_name = env_name
    self.seed = seed
    self.n_features = n_features
    self.noise_var = None

    # env: wrap Quanser RR envs in GentlyTerminating, use the plain gym env otherwise
    if 'RR' in self.env_name:
        self.env = quanser_robots.GentlyTerminating(gym.make(self.env_name))
    else:
        self.env = gym.make(self.env_name)
    self.env.seed(self.seed)

    # dynamics model
    # TODO learn length scale by evidence maximization
    self.mgp = MGPR(dim=self.env.observation_space.shape[0])

    self.states = []
    self.actions = []
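# Illustrative sketch: how the `states` and `actions` buffers above could be filled with a
# random rollout before fitting the MGPR dynamics model. The method name `sample_rollout`
# and its default arguments are assumptions for illustration only.
def sample_rollout(self, n_samples=100):
    """Collect (state, action) pairs with a random policy for dynamics-model learning."""
    state = self.env.reset()
    for _ in range(n_samples):
        action = self.env.action_space.sample()
        next_state, _, done, _ = self.env.step(action)
        self.states.append(state)
        self.actions.append(action)
        # restart the episode if the env terminated, otherwise continue from the next state
        state = self.env.reset() if done else next_state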
def run(self):
    """
    Start the A3C training workers and the test process.
    :return: None
    """
    torch.manual_seed(self.args.seed)

    if "RR" in self.args.env_name:
        env = quanser_robots.GentlyTerminating(gym.make(self.args.env_name))
    else:
        env = gym.make(self.args.env_name)

    optimizer = None
    critic_optimizer = None
    model_critic = None

    if self.args.shared_model:
        model = get_model(env=env, shared=self.args.shared_model, path=self.args.path, T=self.T,
                          global_reward=self.global_reward)
        if not self.args.no_shared_optimizer:
            optimizer = get_shared_optimizer(model=model, optimizer_name=self.args.optimizer,
                                             lr=self.args.lr, path=self.args.path)
    else:
        model, model_critic = get_model(env=env, shared=self.args.shared_model, path=self.args.path,
                                        T=self.T, global_reward=self.global_reward)
        if not self.args.no_shared_optimizer:
            optimizer, critic_optimizer = get_shared_optimizer(model=model,
                                                               optimizer_name=self.args.optimizer,
                                                               lr=self.args.lr, path=self.args.path,
                                                               model_critic=model_critic,
                                                               optimizer_name_critic=self.args.optimizer,
                                                               lr_critic=self.args.lr_critic)

    lr_scheduler = None
    lr_scheduler_critic = None

    if not self.args.no_shared_optimizer and self.args.lr_scheduler == "exponential":
        lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99)
        if critic_optimizer:
            lr_scheduler_critic = torch.optim.lr_scheduler.ExponentialLR(critic_optimizer, gamma=0.99)

    p = Process(target=test, args=(self.args, self.args.worker, model, self.T, self.global_reward,
                                   optimizer, model_critic, critic_optimizer))
    p.start()
    self.worker_pool.append(p)

    if not self.args.test:
        for wid in range(0, self.args.worker):
            p = Process(target=train, args=(self.args, wid, model, self.T, self.global_reward,
                                            optimizer, model_critic, critic_optimizer,
                                            lr_scheduler, lr_scheduler_critic))
            p.start()
            self.worker_pool.append(p)
            time.sleep(1)

    for p in self.worker_pool:
        p.join()
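# Illustrative sketch: the `self.args` namespace used above (and by `test` below) needs at least
# the following console arguments. The flag names follow the attribute names actually read from
# `args` in this section; the defaults and help texts are assumptions for illustration only.
import argparse

parser = argparse.ArgumentParser(description='A3C')
parser.add_argument('--env-name', default='CartpoleStabShort-v0', help='gym / Quanser env id')
parser.add_argument('--seed', type=int, default=1)
parser.add_argument('--worker', type=int, default=4, help='number of training workers')
parser.add_argument('--lr', type=float, default=1e-4, help='learning rate for actor / shared model')
parser.add_argument('--lr-critic', type=float, default=1e-3, help='learning rate for split critic')
parser.add_argument('--optimizer', default='rmsprop')
parser.add_argument('--lr-scheduler', default=None, help='e.g. "exponential"')
parser.add_argument('--shared-model', action='store_true', help='use one network for actor and critic')
parser.add_argument('--no-shared-optimizer', action='store_true')
parser.add_argument('--normalizer', default=None)
parser.add_argument('--path', default=None, help='optional checkpoint to restore')
parser.add_argument('--monitor', action='store_true')
parser.add_argument('--no-render', action='store_true')
parser.add_argument('--test', action='store_true', help='run evaluation only')
parser.add_argument('--test-runs', type=int, default=10)
parser.add_argument('--max-action', type=float, default=1.0)
parser.add_argument('--max-episode-length', type=int, default=1000)
args = parser.parse_args()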
def test(args, worker_id: int, global_model: torch.nn.Module, T: Value, global_reward: Value = None,
         optimizer: torch.optim.Optimizer = None, global_model_critic: CriticNetwork = None,
         optimizer_critic: torch.optim.Optimizer = None):
    """
    Start a worker in test mode, i.e. no training is done, only evaluation of the current performance.
    Loosely based on https://github.com/ikostrikov/pytorch-a3c/blob/master/_test.py

    :param args: console arguments
    :param worker_id: id of the worker, used to differentiate workers and initialize different seeds
    :param global_model: global model which is optimized / for split models: the actor
    :param T: global counter of steps
    :param global_reward: global running reward value
    :param optimizer: optimizer for the shared model / for split models: the actor model
    :param global_model_critic: optional global critic model for split networks
    :param optimizer_critic: optional critic optimizer for split networks
    :return: None
    """
    logging.info("test worker started.")
    torch.manual_seed(args.seed + worker_id)

    if "RR" in args.env_name:
        env = quanser_robots.GentlyTerminating(gym.make(args.env_name))
    else:
        if args.monitor:
            env = Monitor(gym.make(args.env_name), '100_test_runs',
                          video_callable=lambda count: count % 100 == 0, force=True)
        else:
            env = gym.make(args.env_name)

    env.seed(args.seed + worker_id)

    normalizer = get_normalizer(args.normalizer, env)

    # get an instance of the current global model state
    model = copy.deepcopy(global_model)
    model.eval()

    model_critic = None
    if global_model_critic:
        model_critic = copy.deepcopy(global_model_critic)
        model_critic.eval()

    state = torch.from_numpy(env.reset())

    writer = SummaryWriter(comment='_test', log_dir='experiments/runs/')
    start_time = time.time()

    t = 0
    episode_reward = 0

    done = False
    global_iter = 0

    best_global_reward = -np.inf
    best_test_reward = -np.inf

    while True:
        # get params from the shared global model
        model.load_state_dict(global_model.state_dict())
        if not args.shared_model:
            model_critic.load_state_dict(global_model_critic.state_dict())

        rewards = []
        eps_len = []

        sleep = True

        # run args.test_runs episodes to estimate the current average performance
        for i in range(args.test_runs):
            while not done:
                t += 1

                if not args.no_render:
                    if i == 0 and t % 1 == 0 and "RR" not in args.env_name:
                        env.render()
                        if args.monitor and sleep:
                            # add a small delay to do a screen capture of the test run if needed
                            time.sleep(1)
                            sleep = False

                # apply min/max scaling on the environment
                with torch.no_grad():
                    # select the mean of the normal dist as action --> expectation
                    if args.shared_model:
                        _, mu, _ = model(normalizer(state))
                    else:
                        mu, _ = model(normalizer(state))
                    action = mu.detach()

                state, reward, done, _ = env.step(np.clip(action.numpy(), -args.max_action, args.max_action))
                done = done or t >= args.max_episode_length

                episode_reward += reward

                if done:
                    # reset the cumulated reward and the episode counter as well as the env
                    rewards.append(episode_reward)
                    episode_reward = 0

                    eps_len.append(t)
                    t = 0

                    state = env.reset()
                    state = torch.from_numpy(state)

            # necessary to run more than one episode
            done = False

        time_print = time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time))

        std_reward = np.std(rewards)
        rewards = np.mean(rewards)

        new_best = rewards > best_test_reward
        writer.add_scalar("reward/test", rewards, int(T.value))
        writer.add_scalar("episode/length", np.mean(eps_len), int(T.value))

        log_string = f"Time: {time_print}, T={T.value} -- n_runs={args.test_runs} " \
                     f"-- mean total reward={rewards:.5f} +/- {std_reward:.5f} " \
                     f"-- mean episode length={np.mean(eps_len):.5f} +/- {np.std(eps_len):.5f} " \
                     f"-- global reward={global_reward.value:.5f}"

        if new_best:
            # highlight messages if progress was made
            logging.info(log_string)

            best_global_reward = global_reward.value if global_reward.value > best_global_reward else best_global_reward
            best_test_reward = rewards if rewards > best_test_reward else best_test_reward
            model_type = 'shared' if args.shared_model else 'split'

            save_checkpoint({
                'epoch': T.value,
                'model': model.state_dict(),
                'model_critic': model_critic.state_dict() if model_critic is not None else None,
                'global_reward': global_reward.value,
                # only save optimizers if shared ones are used
                'optimizer': optimizer.state_dict() if optimizer else None,
                'optimizer_critic': optimizer_critic.state_dict() if optimizer_critic else None,
            },
                path=f"./experiments/checkpoints/model_{model_type}_T-{T.value}_"
                     f"global-{global_reward.value:.5f}_test-{rewards:.5f}.pth.tar")
        else:
            # by default only log debug messages if no progress was made
            logging.debug(log_string)

        global_iter += 1

        # run evaluation only once in test mode
        if args.test:
            break
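# Illustrative sketch: restoring a checkpoint written by `save_checkpoint` above. The restore
# helper used by the project is not shown in this section, so the function below only mirrors
# the dictionary keys used when saving; its name and signature are assumptions for illustration.
def load_checkpoint_into(model, path, model_critic=None, optimizer=None, optimizer_critic=None):
    """Load model, optional critic and optional optimizer states from a saved checkpoint."""
    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint['model'])
    if model_critic is not None and checkpoint['model_critic'] is not None:
        model_critic.load_state_dict(checkpoint['model_critic'])
    if optimizer is not None and checkpoint['optimizer'] is not None:
        optimizer.load_state_dict(checkpoint['optimizer'])
    if optimizer_critic is not None and checkpoint['optimizer_critic'] is not None:
        optimizer_critic.load_state_dict(checkpoint['optimizer_critic'])
    return checkpoint['epoch'], checkpoint['global_reward']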