class A3CAgent(object):

    def __init__(self):
        self.num_state = OBS_SPACE      # observation size
        self.num_actions = NUM_ACTIONS  # number of actions
        self.lr = tf.Variable(3e-4)     # variable used for decaying the learning rate
        self.starter_lr = 3e-4          # start value of the learning rate
        # optimizer that trains the global network with the gradients of the locals;
        # use locking because multiple threads update it
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.starter_lr,
                                                use_locking=True)
        # the global Actor-Critic network
        self.global_network = Actor_Critic(self.num_actions)
        # prepare the global network - used to construct the network on eager execution
        self.global_network(
            tf.convert_to_tensor(np.random.random((1, 84, 84, 4)), dtype=tf.float32))
        self.discount_rate = 0.99

    def start_threads(self):
        # max number of episodes
        max_eps = 1e6

        # create one local environment for each thread
        envs = []
        for _ in range(NUM_THREADS):
            _env = gym_super_mario_bros.make(env_name)
            _env = JoypadSpace(_env, SIMPLE_MOVEMENT)
            env = atari_wrapper.wrap_dqn(_env)
            envs.append(env)

        # create the threads and assign each its environment and exploration schedule
        threads = []
        for i in range(NUM_THREADS):
            thread = threading.Thread(
                target=train_thread,
                daemon=True,
                args=(self, max_eps, envs[i], self.discount_rate, self.optimizer,
                      stats, AnnealingVariable(.7, 1e-20, 10000), i))
            threads.append(thread)

        # start the threads
        for t in threads:
            print("STARTING")
            t.start()
            time.sleep(0.5)

        try:
            [t.join() for t in threads]  # wait for the threads to finish
        except KeyboardInterrupt:
            print("Exiting threads!")

    def save_weights(self):
        print("Saving Weights")
        self.global_network.save_weights("A3CMarioWeights.h5")

    def restore_weights(self):
        print("Restoring Weights!")
        self.global_network.load_weights("A3CMarioWeights.h5")
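# The class above leans on several module-level names that are defined elsewhere
# in the original script (NUM_THREADS, env_name, stats, lock, episodes,
# train_frequency). A minimal, hedged sketch of how that glue and a training run
# might be wired up; the concrete values below are assumptions, not the original code.
if __name__ == "__main__":
    tf.enable_eager_execution()   # the networks are built and trained eagerly
    lock = threading.Lock()       # guards updates to the global network
    episodes = 0                  # shared episode counter used by train_thread
    train_frequency = 20          # assumed number of steps between updates
    stats = Stats()               # assumed statistics/logging helper
    agent = A3CAgent()
    agent.start_threads()
    agent.save_weights()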
    def __init__(self):
        self.num_actions = NUM_ACTIONS  # number of actions
        self.starter_lr = 1e-4          # start value of the learning rate
        # optimizer that trains the global network with the gradients of the locals;
        # use locking because multiple threads update it
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.starter_lr,
                                                use_locking=True)
        # the global Actor-Critic network
        self.global_network = Actor_Critic(self.num_actions)
        # prepare the global network - used to construct the network on eager execution
        self.global_network(
            tf.convert_to_tensor(np.random.random((1, 84, 84, 4)), dtype=tf.float32))
        self.restore_weights()
class A3CAgent(object):

    def __init__(self):
        self.num_actions = NUM_ACTIONS  # number of actions
        self.starter_lr = 1e-4          # start value of the learning rate
        # optimizer that trains the global network with the gradients of the locals;
        # use locking because multiple threads update it
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.starter_lr,
                                                use_locking=True)
        # the global Actor-Critic network
        self.global_network = Actor_Critic(self.num_actions)
        # prepare the global network - used to construct the network on eager execution
        self.global_network(
            tf.convert_to_tensor(np.random.random((1, 84, 84, 4)), dtype=tf.float32))
        self.restore_weights()

    def pick_action(self, state, exploration_rate=0.0):
        if np.random.random() < exploration_rate:
            return test_env.action_space.sample()  # pick randomly
        state = np.expand_dims(state, axis=0)
        logits, _ = self.global_network(state)
        probs = tf.nn.softmax(logits)
        action = np.random.choice(self.num_actions, 1, p=probs.numpy()[0])
        return action[0]

    def play(self, env, stats, episodes: int = 100, exploration_rate=0.0):
        rewards_arr = np.zeros(episodes)
        for episode in range(episodes):
            episode_reward = 0
            done = False
            state = env.reset()
            while not done:
                env.render()
                # time.sleep(0.05)
                action = self.pick_action(state, exploration_rate)
                next_state, reward, done, _ = env.step(action)
                episode_reward += reward
                state = next_state
            if callable(stats):
                stats(self, episode_reward)
            rewards_arr[episode] = episode_reward
            print(episode_reward)
        stats.save_stats()
        return rewards_arr

    def restore_weights(self):
        self.global_network.load_weights('A3CPong.h5')
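# A hedged example of how this evaluation agent might be driven. Only the weight
# file name ('A3CPong.h5') appears above, so the gym id, the Stats helper and the
# exploration rate are assumptions; the environment is stored in a module-level
# test_env because pick_action falls back to test_env.action_space.sample().
test_env = atari_wrapper.wrap_dqn(gym.make("PongNoFrameskip-v4"))
stats = Stats()   # assumed statistics helper with a save_stats() method
agent = A3CAgent()
agent.play(test_env, stats, episodes=10, exploration_rate=0.02)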
    def __init__(self):
        self.num_state = OBS_SPACE      # observation size
        self.num_actions = NUM_ACTIONS  # number of actions
        self.lr = tf.Variable(3e-4)     # variable used for decaying the learning rate
        self.starter_lr = 1e-4          # start value of the learning rate
        # optimizer that trains the global network with the gradients of the locals;
        # use locking because multiple threads update it
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.starter_lr,
                                                use_locking=True)
        # the global Actor-Critic network
        self.global_network = Actor_Critic(self.num_actions)
        # prepare the global network - used to construct the network on eager execution
        self.global_network(
            tf.convert_to_tensor(np.random.random((1, 84, 84, 4)), dtype=tf.float32))
        self.discount_rate = 0.99
def train_thread(agent, max_eps, env, discount_rate, optimizer,
                 statistics: Stats, exploration_rate: AnnealingVariable, number):
    # create the local network and initialize its weights from the global network
    local_network = Actor_Critic(env.action_space.n)
    # build it (must do this when eager execution is enabled)
    local_network(
        tf.convert_to_tensor(np.random.random((1, 84, 84, 4)), dtype=tf.float32))
    local_network.set_weights(agent.global_network.get_weights())
    # lr_decay_anneal = AnnealingVariable(1e-4, 1e-24, 10e6)

    global episodes  # total number of episodes finished across all threads

    # local lists for states, rewards and actions
    states, rewards, actions = [], [], []
    while episodes < max_eps:
        r_per_episode = 0.0
        done = False
        step = 0
        state = env.reset()

        # still training
        while not done and episodes < max_eps:
            exploration_rate.step()  # decay the exploration rate
            states.append(state)     # add the observation/state to the state list
            # pick an action according to the policy network and the exploration rate
            action = pick_action(env, local_network, state, exploration_rate.value)
            # take the action and observe the next state, the reward and whether the episode is over
            next_state, reward, done, _ = env.step(action)
            # lr_decay_anneal.step()
            # append the reward experienced to the reward list
            rewards.append(reward)
            # append the action taken
            actions.append(action)
            r_per_episode += reward
            step += 1

            # if enough experience has been gathered or the episode is over -> train on it
            if step % train_frequency == 0 or done:
                # GradientTape records the gradients during the evaluation of the loss function
                # -> eager execution MUST be enabled for this to work
                with tf.GradientTape() as tape:
                    # compute the loss for this batch of experience
                    loss = compute_loss_from_batch(local_network, states, rewards,
                                                   actions, done, next_state,
                                                   discount_rate)
                # rewind the tape and get the gradients of the loss
                # with respect to the weights of the local network (Actor-Critic)
                gradients = tape.gradient(loss, local_network.trainable_weights)

                # lock because multiple threads update the global network
                lock.acquire()
                # agent.lr.assign(lr_decay_anneal.value)
                # apply the gradients computed on the local network to the global weights
                optimizer.apply_gradients(
                    zip(gradients, agent.global_network.trainable_weights))
                # update the local network with the weights of the global network
                local_network.set_weights(agent.global_network.get_weights())
                lock.release()

                # empty the state, reward and action lists
                states, rewards, actions = [], [], []

            state = next_state

        with lock:
            # save stats
            if episodes < max_eps:
                episodes += 1
                statistics(agent, r_per_episode)
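# train_thread relies on two helpers that are not shown here: pick_action and
# compute_loss_from_batch. The following is a hedged sketch of what they might
# look like for this eager TF 1.x setup; the n-step return bootstrapping and the
# value-loss/entropy coefficients (0.5, 0.01) are assumptions, not the original code.
def pick_action(env, network, state, exploration_rate):
    if np.random.random() < exploration_rate:
        return env.action_space.sample()
    logits, _ = network(tf.convert_to_tensor(np.expand_dims(state, axis=0),
                                             dtype=tf.float32))
    probs = tf.nn.softmax(logits).numpy()[0]
    return np.random.choice(len(probs), p=probs)


def compute_loss_from_batch(network, states, rewards, actions,
                            done, next_state, discount_rate):
    # bootstrap from the value of the last state unless the episode ended
    if done:
        reward_sum = 0.0
    else:
        _, value = network(tf.convert_to_tensor(
            np.expand_dims(next_state, axis=0), dtype=tf.float32))
        reward_sum = float(value.numpy()[0][0])

    # discounted n-step returns, computed backwards from the bootstrap value
    discounted = []
    for r in rewards[::-1]:
        reward_sum = r + discount_rate * reward_sum
        discounted.append(reward_sum)
    discounted.reverse()

    logits, values = network(tf.convert_to_tensor(np.array(states), dtype=tf.float32))
    advantages = tf.convert_to_tensor(np.array(discounted)[:, None],
                                      dtype=tf.float32) - values

    # critic loss: squared advantage; actor loss: policy gradient with entropy bonus
    value_loss = tf.reduce_mean(tf.square(advantages))
    policy = tf.nn.softmax(logits)
    entropy = tf.reduce_mean(
        tf.reduce_sum(-policy * tf.nn.log_softmax(logits), axis=1))
    neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=np.array(actions), logits=logits)
    policy_loss = tf.reduce_mean(
        neg_log_prob * tf.stop_gradient(tf.squeeze(advantages, axis=1)))
    return policy_loss + 0.5 * value_loss - 0.01 * entropy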
    actor_lr=3e-4, critic_lr=1e-3, train_actor_iters=80, train_critic_iters=80,
    lam=0.97, max_ep_len=1000, target_kl=0.01, seed=0)

np.random.seed(config['seed'])

env = gym.make(config['env'])
obs_space = env.observation_space
act_space = env.action_space
obs_size = obs_space.shape
act_size = act_space.shape

ac = Actor_Critic(obs_space, act_space)

local_steps_per_epoch = config['steps_per_epoch']
buf = PPO_Buffer(obs_size, act_size, local_steps_per_epoch,
                 config['gamma'], config['lam'])


def compute_loss_actor(data):
    obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data['logp']

    pi, logp = ac.actor(obs, act)
    ratio = torch.exp(logp - logp_old)
    clip_adv = torch.clamp(ratio, 1 - config['clip_ratio'],
                           1 + config['clip_ratio']) * adv
    loss_actor = -(torch.min(ratio * adv, clip_adv)).mean()
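# The snippet above stops after the clipped PPO actor loss. A matching critic
# loss for this setup would be a mean-squared error between the value prediction
# and the return stored in the buffer. This is a hedged sketch, not the original
# code: it assumes ac.critic(obs) returns value estimates and that the buffer
# exposes returns under the key 'ret'.
def compute_loss_critic(data):
    obs, ret = data['obs'], data['ret']
    return ((ac.critic(obs) - ret) ** 2).mean()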
def main(args):
    # create env
    env = gym.make(args.env_name)
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    # follow different logic depending on the action space of the env
    hidden_size = args.hidden_size

    if args.action_space == "continuous":
        # get env info
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space
        max_action = env.action_space.high
        min_action = env.action_space.low
        print("number of actions: {0}, dim of states: {1}, "
              "max_action: {2}, min_action: {3}".format(
                  action_dim, state_dim, max_action, min_action))
        # create policy
        policy = Actor_Critic(state_dim, hidden_size, action_dim,
                              baseline=args.baseline)
    elif args.action_space == "discrete":
        # get env info
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n
        print("number of actions: {0}, dim of states: {1}".format(
            action_dim, state_dim))
        # create policy
        policy = Actor_Critic_discrete(state_dim, hidden_size, action_dim,
                                       baseline=args.baseline)
    else:
        raise NotImplementedError

    # set up comet_ml to track experiments
    if os.path.isfile("settings.json"):
        with open('settings.json') as f:
            data = json.load(f)
        args.comet_apikey = data["apikey"]
        args.comet_username = data["username"]
    else:
        raise NotImplementedError

    experiment = Experiment(api_key=args.comet_apikey,
                            project_name="simple_policy_gradient",
                            auto_output_logging="None",
                            workspace=args.comet_username,
                            auto_metric_logging=False,
                            auto_param_logging=False)
    experiment.set_name(args.namestr)
    args.experiment = experiment

    # start of the experiment: keep looping until the desired number of episodes is reached
    max_episodes = args.num_episodes
    total_episodes = 0  # keep track of how many episodes we have done

    while total_episodes < max_episodes:
        obs = env.reset()
        done = False
        trajectory = []     # trajectory info for the REINFORCE update
        episode_reward = 0  # keep track of the reward per episode

        while not done:
            action, ln_prob = policy.select_action(np.array(obs))
            next_state, reward, done, _ = env.step(action)
            trajectory.append(
                [np.array(obs), action, ln_prob, reward, next_state, done])
            obs = next_state
            episode_reward += reward

        total_episodes += 1

        # update actor/policy and critic/value network
        policy_loss, value_loss = policy.train(trajectory)

        experiment.log_metric("value function loss", value_loss,
                              step=total_episodes)
        experiment.log_metric("policy loss", policy_loss, step=total_episodes)
        experiment.log_metric("episode reward", episode_reward,
                              step=total_episodes)

        if total_episodes % 10 == 0:
            evaluate_policy(policy, env)

    env.close()
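# evaluate_policy is called above but not defined in this snippet. A minimal
# sketch under the same interface (policy.select_action returning an action and
# its log-probability); the episode count and the averaging are assumptions.
def evaluate_policy(policy, env, episodes=5):
    total_reward = 0.0
    for _ in range(episodes):
        obs, done = env.reset(), False
        while not done:
            action, _ = policy.select_action(np.array(obs))
            obs, reward, done, _ = env.step(action)
            total_reward += reward
    print("evaluation: average reward over {0} episodes: {1}".format(
        episodes, total_reward / episodes))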
env = gym.envs.make(env_name)
MAX_ACTION = env.action_space.high
MIN_ACTION = env.action_space.low
ob_dim = env.observation_space.sample().shape[0]
ac_dim = env.action_space.sample().shape[0]

# MLP function approximators
pnet = MLP(2 * ac_dim, pnet_hparams)
vnet = MLP(1, vnet_hparams)

# actor and critic networks/training graphs in TF
actor = TF_CPolicy(pnet, ob_dim, ac_dim, hparams=actor_hparams,
                   min_val=MIN_ACTION, max_val=MAX_ACTION)
critic = TF_Value(vnet, ob_dim, hparams=critic_hparams)

# change the structure of the reward function for the car env
if env_name == "MountainCarContinuous-v0":
    def distance_reward(env, reward):
        return reward - np.abs(env.goal_position - env.state[0])

    reward_fn = distance_reward
else:
    reward_fn = None

# train and run the actor-critic agent
ac = Actor_Critic(env, actor, critic, hparams=ac_hparams, reward_fn=reward_fn)
ac.train(video=False)
for _ in range(5):
    ac.do_episode(video=True)