class Agent(object):
    """Implements an agent that follows DDPG algorithm."""

    def __init__(self, state_shape, num_actions, action_scale=2.0,
                 discount=0.99, tau=0.01, actor_lrate=0.001,
                 critic_lrate=0.01, l2_decay=1e-3, batch_size=64,
                 q_update_iter=1, capacity=1000000):
        if not isinstance(state_shape, tuple):
            raise AssertionError('state_shape must be of type <tuple>.')
        elif len(state_shape) == 0:
            raise AssertionError('No state space dimensions provided.')
        elif num_actions == 0:
            raise ValueError('Number of actions must be > 0.')
        elif capacity < batch_size:
            raise ValueError('Replay capacity must be > batch_size.')

        self.batch_size = batch_size
        self.q_update_iter = q_update_iter
        self.replay_buffer = ReplayBuffer(capacity, state_shape, num_actions)
        self.actor = Actor(state_shape, num_actions, action_scale, actor_lrate, tau)
        self.critic = Critic(state_shape, num_actions, discount, critic_lrate, tau, l2_decay)
        self.step = 0

    def choose_action(self, state):
        """Returns an action for the agent to perform in the environment."""
        return self.actor.predict(state).flatten()

    def update_buffer(self, s0, a, r, s1, terminal):
        """Updates memory replay buffer with new experience."""
        self.replay_buffer.update(s0, a, r, s1, terminal)

    def update_policy(self):
        """Updates Q-networks using replay memory data + performing SGD."""
        mb = self.replay_buffer.sample(self.batch_size)

        # To update the critic, we need a prediction from target policy
        target_a = self.actor.predict_target(mb[3])
        self.critic.train_fn(mb[0], mb[1], mb[3], target_a, mb[2], mb[4])

        # Updating the actor requires gradients from critic
        action = self.actor.predict(mb[0])
        grads = self.critic.get_action_grads(mb[0], action)
        self.actor.train_fn(mb[0], grads)

        # Every few steps in an episode we update target network weights
        if self.step == self.q_update_iter:
            self.actor.update_target()
            self.critic.update_target()
        self.step = self.step + 1 if self.step != self.q_update_iter else 0
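# The Agent above assumes a ReplayBuffer(capacity, state_shape, num_actions)
# with update(s0, a, r, s1, terminal) and sample(batch_size) returning a
# (states, actions, rewards, next_states, terminals) tuple. The buffer shipped
# with this repository is not shown; the following is an illustrative sketch
# of such an interface, not the project's actual implementation.
import numpy as np


class SimpleReplayBuffer(object):
    def __init__(self, capacity, state_shape, num_actions):
        self.capacity = capacity
        self.s0 = np.zeros((capacity,) + state_shape, dtype=np.float32)
        self.a = np.zeros((capacity, num_actions), dtype=np.float32)
        self.r = np.zeros((capacity,), dtype=np.float32)
        self.s1 = np.zeros((capacity,) + state_shape, dtype=np.float32)
        self.t = np.zeros((capacity,), dtype=np.float32)
        self.idx = 0   # next slot to overwrite
        self.size = 0  # number of valid entries

    def update(self, s0, a, r, s1, terminal):
        i = self.idx
        self.s0[i], self.a[i], self.r[i] = s0, a, r
        self.s1[i], self.t[i] = s1, float(terminal)
        self.idx = (self.idx + 1) % self.capacity
        self.size = min(self.size + 1, self.capacity)

    def sample(self, batch_size):
        idx = np.random.randint(0, self.size, size=batch_size)
        return (self.s0[idx], self.a[idx], self.r[idx], self.s1[idx], self.t[idx])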
def main():
    env_spec = registry[env_name]
    env = gym.make(env_spec["id"])
    ep_max_steps = env_spec["max_episode_steps"]

    agent = DDPG(env.observation_space.shape, env.action_space.shape,
                 env.action_space.low[0], env.action_space.high[0])
    replay_buffer = ReplayBuffer()

    state = env.reset()
    done = False
    ep_timesteps = 0
    ep_reward = 0
    ep_num = 0
    reward_history = []

    for t in range(TOTAL_TIMESTEPS):
        ep_timesteps += 1

        # Select action: random before START_TIMESTEP, from the policy afterwards
        if t < START_TIMESTEP:
            action = env.action_space.sample()
        else:
            action = agent.select_action(np.array(state))

        # Perform action
        next_state, reward, done, _ = env.step(action)

        # Only mark the transition terminal if the episode did not end because
        # of the time limit
        train_done = done and ep_timesteps < ep_max_steps
        replay_buffer.add(
            TransitionTuple(state, action, next_state, reward, int(train_done)))

        state = next_state
        ep_reward += reward

        if t >= START_TIMESTEP:
            agent.train(replay_buffer, BATCH_SIZE)

        if done:
            reward_history.append(ep_reward)
            print(
                f"[Episode {ep_num+1}, Timestep {t+1}] Total reward: {ep_reward} Total timesteps: {ep_timesteps}"
            )
            state = env.reset()
            done = False
            ep_timesteps = 0
            ep_reward = 0
            ep_num += 1

        if RENDER:
            env.render()

    # Visualize results
    if OUTPUT_PLOT:
        sns.lineplot(x=np.arange(len(reward_history)) + 1, y=reward_history)
        plt.ylabel("Episode Reward")
        plt.xlabel("Episode Number")
        plt.savefig(OUTPUT_PLOT)
def __init__(self, state_size, action_size, device,
             actor_args={}, critic_args={}):
    """Initializes the DDPG agent.

    Args:
        state_size (int): Dimension of each state
        action_size (int): Dimension of each action
        device (torch.device): Device to use for calculations
        actor_args (dict): Arguments describing the actor network
        critic_args (dict): Arguments describing the critic network
    """
    self.state_size = state_size
    """Dimension of each state"""

    self.action_size = action_size
    """Dimension of each action"""

    self.device = device
    """Device to use for calculations"""

    self.t_step = 0
    """Timestep between training updates"""

    # Parameters
    # Actor network
    self.actor_local = Actor(state_size, action_size, **actor_args).to(device)
    self.actor_target = Actor(state_size, action_size, **actor_args).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic network
    self.critic_local = Critic(state_size, action_size, **critic_args).to(device)
    self.critic_target = Critic(state_size, action_size, **critic_args).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=LR_CRITIC,
                                       weight_decay=WEIGHT_DECAY)

    # Noise process for exploration
    self.noise = OUNoise(action_size, sigma=NOISE_SD)

    # Replay memory
    self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, self.device)
def main():
    USE_CUDA = torch.cuda.is_available()

    env = gym.make('CartPole-v0')
    dqn = DQN(env.observation_space.shape[0], env.action_space.n)
    if USE_CUDA:
        dqn = dqn.cuda()

    optimizer = optim.RMSprop(dqn.parameters(), lr=0.00025, momentum=0.95,
                              alpha=0.95, eps=0.01)
    epsilon_schedule = get_epsilon_schedule(start=1.0, end=0.01, endt=1000,
                                            learn_start=50)
    replay_buffer = ReplayBuffer(capacity=1000)

    agent = DQNAgent(env, dqn, optimizer, epsilon_schedule, replay_buffer,
                     discount_factor=0.99, target_update_rate=10,
                     batch_size=32, learn_start=50)
    agent.train(5000)

    total_reward = agent.play(render=True)
    agent.env.close()
    print('Total Reward: ', total_reward)
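# get_epsilon_schedule() is not defined in this snippet. A plausible reading,
# given the arguments, is a linear anneal from `start` to `end` over `endt`
# steps, held at `start` until `learn_start`. Hypothetical sketch only.
def linear_epsilon_schedule(start=1.0, end=0.01, endt=1000, learn_start=50):
    def epsilon_at(step):
        if step < learn_start:
            return start
        frac = min(1.0, (step - learn_start) / float(endt))
        return start + frac * (end - start)
    return epsilon_at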
def train(
    self,
    env: gym.Env,
    agent: Agent,
    network: Network,
    optimizer,
    window_size: int,
    nb_self_play: int,
    num_unroll_steps: int,
    td_steps: int,
    discount: float,
    batch_size: int,
    nb_train_update: int,
    nb_train_epochs: int,
    max_grad_norm: float,
    filename: str,
    ent_c: float,
):
    replay_buffer = ReplayBuffer(window_size, batch_size)

    for epoch in range(nb_train_epochs):
        network.eval()
        rewards = []
        for _ in range(nb_self_play):
            game_buffer = self._play_one_game(env, agent)
            # game_buffer.print_buffer()
            replay_buffer.append(game_buffer)
            rewards.append(np.sum(game_buffer.rewards))

        network.train()
        losses = []
        for _ in range(nb_train_update):
            batch = replay_buffer.sample_batch(num_unroll_steps, td_steps, discount)
            losses.append(
                self._update_weights(network, optimizer, batch, max_grad_norm, ent_c))

        v_loss, r_loss, p_loss, entropy = np.mean(losses, axis=0)
        print(
            f"Epoch[{epoch+1}]: Reward[{np.mean(rewards)}], "
            f"Loss: V[{v_loss:.6f}]/R[{r_loss:.6f}]/P[{p_loss:.6f}]/E[{entropy:.6f}]"
        )

        if (epoch + 1) % 10 == 0:
            agent.save_model(filename)
class DuelingDQN(BaseAgent):
    def __init__(self, env):
        self.buffer_size = 20000
        self.batch_size = 64
        self.tau = 1
        self.gamma = 0.95
        self.learning_rate = 0.001

        # Exploration Parameters
        self.E_start = 1
        self.E_end = 0.1
        self.E_decay = 0.002
        self.episode = 0

        self.env = env
        self.os = self.env.observation_space
        # self.acs = self.env.action_space
        self.edim = len(self.os.high)
        self.adim = self.env.action_space.n

        self.buffer = ReplayBuffer(self.buffer_size, self.edim, 1)
        self.local = DuelingDQN_Model(self.edim, self.adim, self.learning_rate)
        self.target = DuelingDQN_Model(self.edim, self.adim, self.learning_rate)
        self.initial_weights = self.local.model.get_weights()
        self.target.model.set_weights(self.initial_weights)

    def act(self, state, testing):
        state = state.reshape([1, -1])
        actionQs = self.local.model.predict(state)
        action = np.argmax(actionQs)
        epsilon = self.E_end + (self.E_start - self.E_end) * np.exp(
            -self.E_decay * self.episode)
        if not testing:
            if np.random.rand() < epsilon:
                action = np.random.choice(self.adim)
        # action = np.array([action])
        return action

    def learn(self, state, action, reward, next_state, done, testing):
        # Skip all learning during testing
        if testing:
            return
        act_index = action
        self.buffer.add(state, act_index, reward, next_state, done)
        if done:
            self.episode += 1

        # TODO When upgrading to RDPG this should be per-episode based
        states, actions, rewards, next_states, dones = self.buffer.batch(
            self.batch_size)
        actions = actions.astype(int)  # astype returns a copy, so keep the assignment
        actions = actions.reshape([-1])
        rewards = rewards.reshape([-1])
        dones = dones.reshape([-1])

        # Bellman equation
        target_Q = rewards + self.gamma * np.amax(
            self.target.model.predict_on_batch(next_states),
            axis=1) * (1 - dones)
        self.local.train([states, actions, target_Q])
        self.soft_update(self.target, self.local)

    def soft_update(self, target, local):
        local_weights = np.array(local.model.get_weights())
        target_weights = np.array(target.model.get_weights())
        new_target_weights = (
            1 - self.tau) * target_weights + self.tau * local_weights
        target.model.set_weights(new_target_weights)

    def reset(self):
        shuffle_weights(self.local.model, self.initial_weights)
        self.target.model.set_weights(self.local.model.get_weights())
        self.episode = 0
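# DuelingDQN_Model is not shown in this snippet. For reference, the dueling
# architecture splits the network into a state-value stream V(s) and an
# advantage stream A(s, a), recombined as Q(s, a) = V(s) + A(s, a) - mean_a A(s, a).
# The sketch below is an assumed, minimal Keras version of that head only; the
# real DuelingDQN_Model (and its custom train() method) may differ.
import tensorflow as tf
from tensorflow.keras import layers, Model


def build_dueling_model(state_dim, action_dim, learning_rate):
    inputs = layers.Input(shape=(state_dim,))
    x = layers.Dense(64, activation="relu")(inputs)
    x = layers.Dense(64, activation="relu")(x)

    # Separate value and advantage streams, then recombine
    value = layers.Dense(1)(x)
    advantage = layers.Dense(action_dim)(x)
    q_values = layers.Lambda(
        lambda va: va[0] + va[1] - tf.reduce_mean(va[1], axis=1, keepdims=True)
    )([value, advantage])

    model = Model(inputs=inputs, outputs=q_values)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate), loss="mse")
    return model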
def train(args):
    env, discrete = init_environment(env_name=args.env_name)
    thresh = env.spec.reward_threshold
    print("Starting training! Need {} to solve".format(thresh))
    print(env)
    # print(env.observation_space["observation"].shape[0])
    seed_random(env, args.rand_seed)
    device = init_device()
    writer = init_logger(log_dir=args.log_dir)

    sac = SAC(env, device, at=args.alph_tune, dis=discrete).to(device)
    sac.init_opt(lr=args.learning_rate)

    reward_history = []
    eval_history = []
    reward_cum = 0
    max_reward = -float("inf")

    act_size = sac.actor.action_space
    replay = ReplayBuffer(args.buff_size, sac.actor.state_space, act_size)

    step = 0
    episode = 0
    while step < args.steps and episode < args.num_episodes:
        # Reset environment and record the starting state
        state = env.reset()
        reward_cum = 0
        done = False
        time = 0

        while not done and time < args.time_limit:
            state = torch.from_numpy(state).float().to(device)
            action = sac.get_action(state)
            next_state, reward, done, _ = env.step(action)
            if sac.discrete:
                action = get_one_hot_np(action, sac.soft_q1.action_space)
            replay.store(state.cpu(), action, next_state, reward, done)
            state = next_state
            reward_cum += reward
            step += 1
            time += 1

            if len(replay) > args.batch_size:
                update_SAC(
                    sac,
                    replay,
                    step,
                    writer,
                    batch_size=args.batch_size,
                )

            if step > 0 and step % args.eval_freq == 0:
                print("Evaluating")
                sac.eval()
                num = step / args.eval_freq
                curr_reward = evaluate_SAC(args, env, sac, writer, step)
                eval_history.append((num, curr_reward))
                if curr_reward > max_reward:
                    print("Saving model...")
                    max_reward = curr_reward
                    sac.save()
                print("Steps {} Eval Reward {:.2f}".format(step, curr_reward))
                sac.train()

        # Calculate score to determine when the environment has been solved
        reward_history.append(reward_cum)
        mean_score = np.mean(reward_history[-100:])
        if writer is not None:
            writer.add_scalar("stats/reward", reward_cum, step)
            writer.add_scalar("stats/avg_reward", mean_score, step)
        print(
            "Episode {} Steps {} Reward {:.2f} Avg reward {:.2f}".format(
                episode, step, reward_history[-1], mean_score
            )
        )
        episode += 1

        if thresh is not None and mean_score > thresh:
            print("Solved after {} episodes!".format(episode))
            print("And {} environment steps".format(step))
            break

    fname = "results.out"
    data = np.array(reward_history)
    np.savetxt(fname, data)
    plot_success(reward_history)
num_episode = 5000
epsilon = 1

env = gym.make('Pendulum-v0')
num_action = env.action_space.shape[0]
num_state = env.observation_space.shape[0]
np.random.seed(123)

sess = tf.Session()
from keras import backend as K
K.set_session(sess)

actor = ActorNetwork(sess, num_state, num_action, batch_size, tau, actor_alpha)
critic = CriticNetwork(sess, num_state, num_action, batch_size, tau, critic_alpha)
buff = ReplayBuffer(buffer_size)

with open('actor_model.json', 'w') as json_file:
    json_file.write(actor.model.to_json())
with open('critic_model.json', 'w') as json_file:
    json_file.write(critic.model.to_json())

print('start training')
best_r = -10000

actor.update_target_network()
critic.update_target_network()

try:
    for i in range(num_episode):
        total_reward = 0
def challenger_round():
    challengers = []
    leaders = []
    leader_checkpoints = os.listdir(LEADER_DIR)
    # Need to share the same schedule with all challengers, so they all anneal
    # at same rate
    epsilon_schedule = LinearSchedule(EPS_START, EPS_END, TRAIN_FRAMES)
    for i in range(NUM_LEADERS):
        challenger = try_gpu(
            DQNAgent(6, epsilon_schedule, OBSERVATION_MODE,
                     lr=LR, max_grad_norm=GRAD_CLIP_NORM))
        if i < len(leader_checkpoints):
            leader = try_gpu(
                DQNAgent(6, LinearSchedule(0.1, 0.1, 500000), OBSERVATION_MODE))
            leader_path = os.path.join(LEADER_DIR, leader_checkpoints[i])
            print("LOADING CHECKPOINT: {}".format(leader_path))
            challenger.load_state_dict(
                torch.load(leader_path,
                           map_location=lambda storage, loc: storage))
            leader.load_state_dict(
                torch.load(leader_path,
                           map_location=lambda storage, loc: storage))
        else:
            leader = RandomAgent(6)
            print("INITIALIZING NEW CHALLENGER AND LEADER")
        challengers.append(challenger)
        leaders.append(leader)

    if CHALLENGER_DIR is not None:
        challengers = []
        # Load in all of the leaders
        for checkpoint in os.listdir(CHALLENGER_DIR):
            path = os.path.join(CHALLENGER_DIR, checkpoint)
            print("LOADING FROM CHALLENGER_DIR: {}".format(path))
            challenger = try_gpu(
                DQNAgent(6, LinearSchedule(0.05, 0.05, 1),
                         CHALLENGER_OBSERVATION_MODE,
                         lr=LR, max_grad_norm=GRAD_CLIP_NORM,
                         name=checkpoint))
            challenger.load_state_dict(
                torch.load(path, map_location=lambda storage, loc: storage))
            challengers.append(challenger)

    challenger = EnsembleDQNAgent(challengers)
    leader = EnsembleDQNAgent(leaders)
    if OPPONENT is not None or HUMAN:
        leader = NoOpAgent()
    replay_buffer = ReplayBuffer(1000000)
    rewards = collections.deque(maxlen=1000)
    frames = 0  # number of training frames seen
    episodes = 0  # number of training episodes that have been played
    with tqdm(total=TRAIN_FRAMES) as progress:
        # Each loop completes a single episode
        while frames < TRAIN_FRAMES:
            states = env.reset()
            challenger.reset()
            leader.reset()
            episode_reward = 0.
            episode_frames = 0
            # Each loop completes a single step, duplicates _evaluate() to
            # update at the appropriate frame #s
            for _ in range(MAX_EPISODE_LENGTH):
                frames += 1
                episode_frames += 1
                action1 = challenger.act(states[0])
                action2 = leader.act(states[1])
                next_states, reward, done = env.step(action1, action2)
                episode_reward += reward

                # NOTE: state and next_state are LazyFrames and must be
                # converted to np.arrays
                replay_buffer.add(
                    Experience(states[0], action1._action_index, reward,
                               next_states[0], done))
                states = next_states

                if len(replay_buffer) > 50000 and frames % 4 == 0:
                    experiences = replay_buffer.sample(32)
                    challenger.update_from_experiences(experiences)

                if frames % 10000 == 0:
                    challenger.sync_target()

                if frames % SAVE_FREQ == 0:
                    # TODO: Don't access internals
                    for agent in challenger._agents:
                        path = os.path.join(LEADER_DIR,
                                            agent.name + "-{}".format(frames))
                        print("SAVING CHECKPOINT TO: {}".format(path))
                        torch.save(agent.state_dict(), path)
                    #path = os.path.join(
                    #    LEADER_DIR, challenger.name + "-{}".format(frames))
                    #torch.save(challenger.state_dict(), path)

                if frames >= TRAIN_FRAMES:
                    break

                if done:
                    break

            if episodes % 300 == 0:
                print("Evaluation: {}".format(
                    evaluate(challenger, leader, EPISODES_EVALUATE_TRAIN)))
                print("Episode reward: {}".format(episode_reward))

            episodes += 1
            rewards.append(episode_reward)
            stats = challenger.stats
            stats["Avg Episode Reward"] = float(sum(rewards)) / len(rewards)
            stats["Num Episodes"] = episodes
            stats["Replay Buffer Size"] = len(replay_buffer)
            progress.set_postfix(stats, refresh=False)
            progress.update(episode_frames)
            episode_frames = 0
value_criterion = nn.MSELoss()
soft_q_criterion1 = nn.MSELoss()
soft_q_criterion2 = nn.MSELoss()

value_lr = 3e-4
soft_q_lr = 3e-4
policy_lr = 3e-4

value_optimizer = optim.Adam(value_net.parameters(), lr=value_lr)
soft_q_optimizer1 = optim.Adam(soft_q_net1.parameters(), lr=soft_q_lr)
soft_q_optimizer2 = optim.Adam(soft_q_net2.parameters(), lr=soft_q_lr)
policy_optimizer = optim.Adam(policy_net.parameters(), lr=policy_lr)

replay_buffer_size = 1000000
replay_buffer = ReplayBuffer(replay_buffer_size)

max_frames = 40000
max_steps = 500
frame_idx = 0
rewards = []
batch_size = 128

while frame_idx < max_frames:
    state = env.reset()
    episode_reward = 0

    for step in range(max_steps):
        if frame_idx > 1000:
            action = policy_net.get_action(state).detach()
class Learner(object):
    """
    Generic object which runs the main training loop of anything that trains
    using a replay buffer. Handles updating, logging, saving/loading,
    batching, etc.
    """

    def __init__(self, interactor_queue, lock, config, env_config,
                 learner_config, **bonus_kwargs):
        self.learner_name = self.learner_name()
        self.interactor_queue = interactor_queue
        self.learner_lock = lock
        self.config = config
        self.env_config = env_config
        self.learner_config = learner_config
        self.bonus_kwargs = bonus_kwargs
        self.kill_threads = False
        self.permit_desync = False
        self.need_frames_notification = threading.Condition()
        self._reset_inspections()
        self.total_frames = 0

        self.save_path = util.create_directory(
            "%s/%s/%s/%s" % (self.config["output_root"],
                             self.config["env"]["name"],
                             self.config["name"],
                             self.config["save_model_path"]))
        self.log_path = util.create_directory(
            "%s/%s/%s/%s" % (self.config["output_root"],
                             self.config["env"]["name"],
                             self.config["name"],
                             self.config["log_path"])) + "/%s.log" % self.learner_name

        # replay buffer to store data
        self.replay_buffer_lock = threading.RLock()
        self.replay_buffer = ReplayBuffer(self.learner_config["replay_size"],
                                          np.prod(self.env_config["obs_dims"]),
                                          self.env_config["action_dim"])

        # data loaders pull data from the replay buffer and put it into the
        # tfqueue for model usage
        self.data_loaders = self.make_loader_placeholders()
        queue_capacity = np.ceil(
            1. / self.learner_config["frames_per_update"]
        ) if self.learner_config["frames_per_update"] else 100
        self.tf_queue = tf.FIFOQueue(
            capacity=queue_capacity,
            dtypes=[dl.dtype for dl in self.data_loaders])
        self.enqueue_op = self.tf_queue.enqueue(self.data_loaders)
        self.current_batch = self.tf_queue.dequeue()

        # build the TF graph for the actual model to train
        self.core, self.train_losses, self.train_ops, self.inspect_losses = \
            self.make_core_model()
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

    ## Mandatory functions to override
    def learner_name(self):
        raise Exception('unimplemented: learner_name')

    def make_loader_placeholders(self):
        raise Exception('unimplemented: make_loader_placeholders')

    def make_core_model(self):
        raise Exception('unimplemented: make_core_model')

    ## Optional functions to override
    def initialize(self):
        warnings.warn('unimplemented: initialize')

    def resume_from_checkpoint(self, epoch):
        warnings.warn('unimplemented: resume_from_checkpoint')

    def checkpoint(self):
        warnings.warn('unimplemented: checkpoint')

    def backup(self):
        warnings.warn('unimplemented: backup')

    ## Internal functions
    def _start(self):
        # fetch data from the interactors to pre-fill the replay buffer
        self.prefetch_thread = threading.Thread(
            target=self._poll_interactors,
            args=(True, self.learner_config["frames_before_learning"],))
        self.prefetch_thread.start()
        self.prefetch_thread.join()

        # start the interactor and data loader
        self.data_load_thread = threading.Thread(target=self._run_enqueue_data)
        self.data_load_thread.start()

        # initialize the learner, pretraining if needed
        if self.config["resume"]:
            self._resume_from_checkpoint()
        else:
            self._initialize()

        # re-sync everything, and start up interactions with the environment
        self.interactor_poll_thread = threading.Thread(
            target=self._poll_interactors)
        self.interactor_poll_thread.start()

        # start the clock
        self._last_checkpoint_time = time.time()

    def _learn(self, permit_desync=False, log=True, checkpoint=True, backup=True):
        # this is to keep the frames/update synced properly
        if self.learner_config["frames_per_update"] is not False and not permit_desync:
            if not self._have_enough_frames():
                with self.need_frames_notification:
                    self.need_frames_notification.notify()
                return

        # log
        if log and (self.update_i + 1) % self.learner_config["log_every_n"] == 0:
            self._log()

        # checkpoint
        if checkpoint and (self.update_i + 1) % self.learner_config["epoch_every_n"] == 0:
            self._checkpoint()

        # backup
        if backup and (self.update_i + 1) % self.learner_config["backup_every_n"] == 0:
            self._backup()

        # train
        self._training_step()

    def _have_enough_frames(self):
        gathered_frames = self.total_frames - self.learner_config["frames_before_learning"]
        return gathered_frames > self.learner_config["frames_per_update"] * self.update_i

    def _initialize(self):
        self.epoch = 0
        self.update_i = 0
        self.hours = 0
        self._last_checkpoint_time = time.time()
        self.initialize()
        if self.learner_config["pretrain_n"]:
            self._pretrain()
        self._checkpoint()

    def _pretrain(self):
        for _ in range(self.learner_config["pretrain_n"]):
            self._learn(permit_desync=True, checkpoint=False, backup=False)
        self.epoch = 0
        self.update_i = 0

    def _resume_from_checkpoint(self):
        epoch = util.get_largest_epoch_in_dir(self.save_path, self.core.saveid)
        if not self.config['keep_all_replay_buffers']:
            util.wipe_all_but_largest_epoch_in_dir(self.save_path,
                                                   self.core.saveid)
        if epoch is False:
            raise Exception("Tried to reload but no model found")

        with self.learner_lock:
            self.core.load(self.sess, self.save_path, epoch)
            self.epoch, self.update_i, self.total_frames, self.hours = self.sess.run(
                [self.core.epoch_n, self.core.update_n,
                 self.core.frame_n, self.core.hours])
        with self.replay_buffer_lock:
            self.replay_buffer.load(self.save_path,
                                    '%09d_%s' % (epoch, self.learner_name))

        self.resume_from_checkpoint(epoch)

    def _log(self):
        logstring = "(%3.2f sec) h%-8.2f e%-8d s%-8d f%-8d\t" % (
            time.time() - self._log_time, self.hours, self.epoch,
            self.update_i + 1, self.total_frames) + ', '.join(
                ["%8f" % x for x in (self.running_total / self.denom).tolist()])
        print("%s\t%s" % (self.learner_name, logstring))
        with open(self.log_path, "a") as f:
            f.write(logstring + "\n")
        self._reset_inspections()

    def _reset_inspections(self):
        self.running_total = 0.
        self.denom = 0.
        self._log_time = time.time()

    def _checkpoint(self):
        self.checkpoint()
        self.epoch += 1
        self.hours += (time.time() - self._last_checkpoint_time) / 3600.
        self._last_checkpoint_time = time.time()
        self.core.update_epoch(self.sess, self.epoch, self.update_i,
                               self.total_frames, self.hours)
        with self.learner_lock:
            self.core.save(self.sess, self.save_path)

    def _backup(self):
        self.backup()
        if not self.learner_config['keep_all_replay_buffers']:
            util.wipe_all_but_largest_epoch_in_dir(self.save_path,
                                                   self.core.saveid)
        with self.learner_lock:
            self.core.save(self.sess, self.save_path, self.epoch)
        with self.replay_buffer_lock:
            self.replay_buffer.save(self.save_path,
                                    '%09d_%s' % (self.epoch, self.learner_name))

    def _training_step(self):
        train_ops = tuple([
            op for op, loss in zip(self.train_ops, self.train_losses)
            if loss is not None
        ])
        outs = self.sess.run(train_ops + self.inspect_losses)
        self.running_total += np.array(outs[len(train_ops):])
        self.denom += 1.
        self.update_i += 1

    def _poll_interactors(self, continuous_poll=False,
                          frames_before_terminate=None):
        # poll the interactors for new frames.
        # the synced_condition semaphore prevents this from consuming too much CPU
        while not self.kill_threads:
            if self.learner_config["frames_per_update"] is not False and not continuous_poll:
                with self.need_frames_notification:
                    self.need_frames_notification.wait()
            while not self.interactor_queue.empty():
                new_frames = self.interactor_queue.get()
                self._add_frames(new_frames)
                if frames_before_terminate and self.total_frames >= frames_before_terminate:
                    return

    def _add_frames(self, frames):
        with self.replay_buffer_lock:
            for frame in frames:
                self.replay_buffer.add_replay(*frame)
            self.total_frames = self.replay_buffer.count
        return self.total_frames

    def _run_enqueue_data(self):
        while not self.kill_threads:
            data = self.replay_buffer.random_batch(
                self.learner_config["batch_size"])
            self.sess.run(self.enqueue_op,
                          feed_dict=dict(list(zip(self.data_loaders, data))))

    def _kill_threads(self):
        self.kill_threads = True
DISPLAY_Q_VALUES = True
DISPLAY_VAL_CHART = True
DISPLAY_HEATMAP = True

game_wrapper = GameWrapper(ENV_NAME, MAX_NOOP_STEPS)
print("The environment has the following {} actions: {}".format(
    game_wrapper.env.action_space.n,
    game_wrapper.env.unwrapped.get_action_meanings()))

MAIN_DQN = build_q_network(game_wrapper.env.action_space.n,
                           LEARNING_RATE, input_shape=INPUT_SHAPE)
TARGET_DQN = build_q_network(game_wrapper.env.action_space.n,
                             input_shape=INPUT_SHAPE)

replay_buffer = ReplayBuffer(size=MEM_SIZE, input_shape=INPUT_SHAPE)
agent = Agent(MAIN_DQN, TARGET_DQN, replay_buffer,
              game_wrapper.env.action_space.n, input_shape=INPUT_SHAPE)

print('Loading agent...')
agent.load(RESTORE_PATH)


def display_nparray(arr, maxwidth=500):
    assert len(arr.shape) == 3
    height, width, _channels = arr.shape
class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, num_agents, random_seed,
                 device, lr_actor, lr_critic, weight_decay_critic, batch_size,
                 buffer_size, gamma, tau, update_every, n_updates, eps_start,
                 eps_end, eps_decay):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)
        self.t_step = 0
        self.device = device
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.weight_decay_critic = weight_decay_critic
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.gamma = gamma
        self.tau = tau
        self.update_every = update_every
        self.n_updates = n_updates
        self.eps = eps_start
        self.eps_end = eps_end
        self.eps_decay = eps_decay

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(self.device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(self.device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.lr_critic,
                                           weight_decay=self.weight_decay_critic)

        # Noise process
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size, random_seed, self.device)

    def step(self, state, action, reward, next_state, done, agent_number):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        self.t_step += 1
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory and at interval settings
        if len(self.memory) > self.batch_size:
            if self.t_step % self.update_every == 0:
                for _ in range(self.n_updates):
                    experiences = self.memory.sample()
                    self.learn(experiences, self.gamma, agent_number)

    def act(self, states, add_noise):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(self.device)
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for agent_num, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[agent_num, :] = action
        self.actor_local.train()
        if add_noise:
            actions += self.eps * self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, agent_number):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        if agent_number == 0:
            actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1)
        else:
            actions_next = torch.cat((actions[:, :2], actions_next), dim=1)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        if agent_number == 0:
            actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1)
        else:
            actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

        # Update epsilon noise value
        self.eps = max(self.eps_end, self.eps_decay * self.eps)
        # self.eps = self.eps - (1/self.eps_decay)
        # if self.eps < self.eps_end:
        #     self.eps = self.eps_end

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
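# OUNoise is referenced above but not defined in this snippet. A common
# Ornstein-Uhlenbeck process with the constructor/sample/reset interface used
# here looks roughly like this (hypothetical sketch; mu/theta/sigma defaults
# are assumptions).
import copy
import random

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state (= noise) to the mean (mu)."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update the internal state and return it as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + \
            self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state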
def train(env,
          estimator,
          target_network,
          num_episodes=1000,
          replay_memory_size=500000,
          frame_history_len=4,
          save_every=10,
          update_every=1000,
          discount=0.99,
          epsilon_start=1.0,
          epsilon_end=0.1,
          epsilon_decay_steps=50000,
          batch_size=32,
          record_every=50):
    """
    Deep Q-learning algorithm.

    :param env: openAI gym environment
    :param estimator: estimator model for predicting values
    :param target_network: target estimator, synced from the estimator
    :param num_episodes: number of episodes to run
    :param replay_memory_size: size of replay memory
    :param frame_history_len: number of frames stacked per observation
    :param save_every: save a checkpoint every N episodes
    :param update_every: copy params from estimator into target estimator after this many steps
    :param discount: discount factor
    :param epsilon_start: starting epsilon value
    :param epsilon_end: ending epsilon value
    :param epsilon_decay_steps: steps over which epsilon is annealed
    :param batch_size: minibatch size
    :param record_every: record a video every N episodes
    :return:
    """
    # Load previous state here
    replay_memory = ReplayBuffer(replay_memory_size, frame_history_len)

    # epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

    loss_func = nn.SmoothL1Loss()
    optimizer = torch.optim.Adam(estimator.parameters())

    policy = make_epsilon_greedy_policy(estimator, len(VALID_ACTIONS))

    env = Monitor(env,
                  directory="./monitor",
                  resume=True,
                  video_callable=lambda count: count % record_every == 0)

    total_t = 0
    pbar = tqdm(range(num_episodes))
    pbar.set_description("ep: %d, er: %.2f, et: %d, tt: %d, exp_size: %d" %
                         (0, 0.0, 0, 0, 0))
    for ep in pbar:
        state = env.reset()  # 210 x 160 x 4
        state = process_state(state)  # 94 x 94 x 3

        episode_loss = 0
        episode_reward = 0
        episode_t = 0

        for t in itertools.count():
            epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)]

            last_idx = replay_memory.store_frame(state)
            recent_observations = replay_memory.encode_recent_observation()

            action_dist = policy(recent_observations, epsilon)
            action_dist = action_dist.squeeze(0).numpy()
            action = np.random.choice(np.arange(len(action_dist)), p=action_dist)

            next_state, reward, done, _ = env.step(action)
            reward = max(-1.0, min(reward, 1.0))
            episode_reward += reward

            replay_memory.store_effect(last_idx, action, reward, done)
            next_state = process_state(next_state)
            state = next_state

            if replay_memory.can_sample(batch_size):
                obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = \
                    replay_memory.sample(batch_size)
                # Normalize pixel observations; actions stay integer indices for gather()
                obs_batch = torch.from_numpy(obs_batch).float().to(device) / 255.0
                act_batch = torch.from_numpy(act_batch).long().to(device)
                rew_batch = torch.from_numpy(rew_batch).to(device)
                next_obs_batch = torch.from_numpy(next_obs_batch).float().to(device) / 255.0
                not_done_mask = torch.from_numpy(1 - done_mask).float().to(device)

                state_values = estimator(obs_batch)  # b x VALID_ACTIONS
                state_action_values = torch.gather(state_values, 1,
                                                   act_batch.unsqueeze(1))  # b x 1
                next_state_values_max = target_network(next_obs_batch).detach().max(dim=1)[0]
                next_state_values = not_done_mask * next_state_values_max
                expected_q_value = (next_state_values * discount) + rew_batch

                # bellman_error = expected_q_value - state_action_values.squeeze(1)
                # clipped_bellman_error = bellman_error.clamp(-1, 1)
                # d_error = clipped_bellman_error * -1.0

                loss = loss_func(state_action_values, expected_q_value.unsqueeze(1))
                episode_loss += loss.item()

                # state_action_values.backward(d_error.data.unsqueeze(1))
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            if done:
                break

            total_t += 1
            episode_t = t

        pbar.set_description(
            "ep: %d, el: %.5f, er: %.2f, et: %d, tt: %d, exp_size: %d" %
            (ep, episode_loss, episode_reward, episode_t, total_t,
             replay_memory.num_in_buffer))

        if total_t % update_every == 0:
            copy_model_params(estimator, target_network)

        # save checkpoint
        if ep % save_every == 0:
            torch.save(estimator.state_dict(), './checkpoints/checkpoint.pt')

    env.close()
class Agent(object):
    """DQN Agent that interacts and learns from the environment."""

    def __init__(self, state_size, action_size, device,
                 replay_buffer_size=int(1e5), batch_size=64,
                 discount_factor=0.99, soft_update=1e-3, learning_rate=5e-4,
                 update_every=4, **kwargs):
        """Initializes the DQN agent.

        Args:
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            device (torch.device): Device to use for calculations
            replay_buffer_size (int): Size of replay buffer
            batch_size (int): Size of experience batches during training
            discount_factor (float): Discount factor (gamma)
            soft_update (float): Soft update coefficient (tau)
            learning_rate (float): Learning rate (alpha)
            update_every (int): Steps between updating the network
            **kwargs: Arguments describing the QNetwork
        """
        self.state_size = state_size
        """Dimension of each state"""

        self.action_size = action_size
        """Dimension of each action"""

        self.device = device
        """Device to use for calculations"""

        # Parameters
        self.batch_size = batch_size
        """Size of experience batches during training"""

        self.discount_factor = discount_factor
        """Discount factor (gamma)"""

        self.soft_update = soft_update
        """Soft update coefficient (tau)"""

        self.update_every = update_every
        """Steps between updating the network"""

        # Q Networks
        self.target_network = QNetwork(state_size, action_size, **kwargs) \
            .to(device)
        """Target Q-Network"""

        self.local_network = QNetwork(state_size, action_size, **kwargs) \
            .to(device)
        """Local Q-Network"""

        self.optimizer = optim.Adam(self.local_network.parameters(),
                                    lr=learning_rate)
        """Optimizer used when training the Q-network."""

        # Memory
        self.memory = ReplayBuffer(replay_buffer_size, batch_size, device)

        # Time step
        self.t_step = 0
        """Current time step"""

    def save_weights(self, path):
        """Save local network weights.

        Args:
            path (string): File to save to"""
        self.local_network.save_weights(path)

    def load_weights(self, path):
        """Load local network weights.

        Args:
            path (string): File to load weights from"""
        self.local_network.load_weights(path)

    def act(self, state, eps=0.):
        """Returns action for given state according to the current policy.

        Args:
            state (np.ndarray): Current state
            eps (float): Probability of selecting random action (epsilon)

        Returns:
            int: Epsilon-greedily selected action
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

        # Temporarily set evaluation mode (no dropout &c) & turn off autograd
        self.local_network.eval()
        with torch.no_grad():
            action_values = self.local_network(state)
        self.local_network.train()

        # Select action epsilon-greedily
        if random.random() > eps:
            return np.argmax(action_values.cpu().detach().numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def step(self, state, action, reward, next_state, done):
        """Save experience and learn if due.

        Args:
            state (Tensor): Current state
            action (int): Chosen action
            reward (float): Resulting reward
            next_state (Tensor): State after action
            done (bool): True if terminal state
        """
        self.memory.add(state, action, reward, next_state, done)

        # Learn if at update_every steps
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # Check that we have enough stored experiences
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences)

    def learn(self, experiences):
        """Update Q-network using given experiences.

        Args:
            experiences (Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]):
                SARS'+done tuple
        """
        states, actions, rewards, next_states, dones = experiences

        # Predicted Q values from target model for next states
        # (NB. torch.max returns the tuple (max, argmax))
        q_target_next = self.target_network(next_states).max(dim=1, keepdim=True)[0]

        # Computed target Q values for current state
        q_target = rewards + self.discount_factor * q_target_next * (1 - dones)

        # Predicted Q values from local model for current state
        q_local = self.local_network(states).gather(dim=1, index=actions)

        loss = F.mse_loss(q_local, q_target)

        # Update local network weights
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network
        soft_update(self.local_network, self.target_network, self.soft_update)
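# learn() above calls a module-level soft_update() helper that is not shown in
# this snippet. It presumably performs the usual Polyak averaging
# θ_target ← τ·θ_local + (1 - τ)·θ_target; a sketch of such a helper:
def soft_update(local_network, target_network, tau):
    """Blend local network weights into the target network in place."""
    for target_param, local_param in zip(target_network.parameters(),
                                         local_network.parameters()):
        target_param.data.copy_(tau * local_param.data +
                                (1.0 - tau) * target_param.data)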
game_wrapper = GameWrapper(ENV_NAME, MAX_NOOP_STEPS)
print("The environment has the following {} actions: {}".format(
    game_wrapper.env.action_space.n,
    game_wrapper.env.unwrapped.get_action_meanings()))

writer = tf.summary.create_file_writer(TENSORBOARD_DIR)

MAIN_DQN = build_q_network(game_wrapper.env.action_space.n,
                           LEARNING_RATE, input_shape=INPUT_SHAPE)
TARGET_DQN = build_q_network(game_wrapper.env.action_space.n,
                             input_shape=INPUT_SHAPE)

replay_buffer = ReplayBuffer(size=MEM_SIZE, input_shape=INPUT_SHAPE,
                             use_per=USE_PER)
agent = Agent(MAIN_DQN, TARGET_DQN, replay_buffer,
              game_wrapper.env.action_space.n, input_shape=INPUT_SHAPE,
              batch_size=BATCH_SIZE, use_per=USE_PER)

if LOAD_FROM is None:
    frame_number = 0
    rewards = []
    loss_list = []
else:
    print('Loading from', LOAD_FROM)
def main(config, max_samples):
    get_env_parameters(config)

    log_dir = "logs/scalars/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    file_writer = tf.summary.create_file_writer(log_dir + "/metrics")
    file_writer.set_as_default()
    config['log_dir'] = log_dir

    ray.init()
    parameter_server = ParameterServer.remote(config)
    replay_buffer = ReplayBuffer.remote(config)
    learner = Learner.remote(config, replay_buffer, parameter_server)

    training_actor_ids = []
    eval_actor_ids = []

    learner.start_learning.remote()

    # Create training actors
    for i in range(config["num_workers"]):
        eps = config["max_eps"] * i / config["num_workers"]
        actor = Actor.remote("train-" + str(i), replay_buffer,
                             parameter_server, config, eps)
        actor.sample.remote()
        training_actor_ids.append(actor)

    # Create eval actors
    for i in range(config["eval_num_workers"]):
        eps = 0
        actor = Actor.remote("eval-" + str(i), replay_buffer,
                             parameter_server, config, eps, True)
        eval_actor_ids.append(actor)

    total_samples = 0
    best_eval_mean_reward = np.NINF
    eval_mean_rewards = []
    while total_samples < max_samples:
        tsid = replay_buffer.get_total_env_samples.remote()
        new_total_samples = ray.get(tsid)
        if (new_total_samples - total_samples
                >= config["timesteps_per_iteration"]):
            total_samples = new_total_samples
            print("Total samples:", total_samples)

            parameter_server.set_eval_weights.remote()
            eval_sampling_ids = []
            for eval_actor in eval_actor_ids:
                sid = eval_actor.sample.remote()
                eval_sampling_ids.append(sid)
            eval_rewards = ray.get(eval_sampling_ids)
            print("Evaluation rewards: {}".format(eval_rewards))
            eval_mean_reward = np.mean(eval_rewards)
            eval_mean_rewards.append(eval_mean_reward)
            print("Mean evaluation reward: {}".format(eval_mean_reward))
            tf.summary.scalar('Mean evaluation reward',
                              data=eval_mean_reward, step=total_samples)
            if eval_mean_reward > best_eval_mean_reward:
                print("Model has improved! Saving the model!")
                best_eval_mean_reward = eval_mean_reward
                parameter_server.save_eval_weights.remote()

    print("Finishing the training.")
    for actor in training_actor_ids:
        actor.stop.remote()
    learner.stop.remote()
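# main() above treats ReplayBuffer as a Ray actor (ReplayBuffer.remote(config),
# replay_buffer.get_total_env_samples.remote()). The actual actor class is not
# shown; the sketch below only illustrates the shape of such an actor, and the
# "buffer_size" config key is an assumption.
import random

import ray


@ray.remote
class ReplayBufferActor:
    def __init__(self, config):
        self.capacity = config["buffer_size"]
        self.buffer = []
        self.total_env_samples = 0

    def add(self, experience):
        # Drop the oldest experience once the buffer is full
        if len(self.buffer) >= self.capacity:
            self.buffer.pop(0)
        self.buffer.append(experience)
        self.total_env_samples += 1

    def sample(self, batch_size):
        return random.sample(self.buffer, min(batch_size, len(self.buffer)))

    def get_total_env_samples(self):
        return self.total_env_samples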
class DDPG:
    def __init__(self, config):
        self.config = config
        self.state_size = config.state_size
        self.action_size = config.action_size

        self.actor_local = Actor(self.state_size, self.action_size, 2).to(device)
        self.actor_target = Actor(self.state_size, self.action_size, 2).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=config.LR_ACTOR)

        self.critic_local = Critic(self.state_size, self.action_size, 2).to(device)
        self.critic_target = Critic(self.state_size, self.action_size, 2).to(device)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=config.LR_CRITIC,
        )

        self.memory = ReplayBuffer(config.random_seed, config.BUFFER_SIZE)
        self.noise = OUNoise(self.action_size, config.random_seed)
        self.t_step = 0

        self.soft_update(self.critic_local, self.critic_target, 1)
        self.soft_update(self.actor_local, self.actor_target, 1)

    def step(self, states, actions, rewards, next_states, dones):
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        self.t_step = (self.t_step + 1) % self.config.UPDATE_EVERY
        if len(self.memory) > self.config.BATCH_SIZE and (self.t_step == 0):
            for i in range(self.config.EPOCH):
                experiences = self.memory.sample(self.config.BATCH_SIZE)
                self.learn(experiences)

    def reset(self):
        self.noise.reset()

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def learn(self, experiences):
        states, actions, rewards, next_states, dones = experiences

        Q_targets_next = self.critic_target(next_states,
                                            self.actor_target(next_states))
        Q_targets = rewards + (self.config.GAMMA * Q_targets_next * (1 - dones))
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        actor_loss = -self.critic_local(states, self.actor_local(states)).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.config.TAU)
        self.soft_update(self.actor_local, self.actor_target, self.config.TAU)

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
def train(sess, env, actor, critic, noise, reward, discrete):
    # set up summary writer
    summary_writer = tf.summary.FileWriter("ddpg_summary")

    sess.run(tf.global_variables_initializer())

    # initialize actor and critic target networks
    actor.update_target_network()
    critic.update_target_network()

    # initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    # initialize noise
    ou_level = 0.

    for i in range(MAX_EPISODES):
        s = env.reset()
        ep_reward = 0
        ep_ave_max_q = 0
        episode_buffer = np.empty((0, 5), float)

        for j in range(MAX_EP_STEPS):
            if RENDER_ENV:
                env.render()

            a = actor.predict(np.reshape(s, (1, actor.s_dim)))

            # Add exploration noise
            if i < NOISE_MAX_EP:
                ou_level = noise.ornstein_uhlenbeck_level(ou_level)
                a = a + ou_level

            # Set action for discrete and continuous action spaces
            if discrete:
                action = np.argmax(a)
            else:
                action = a[0]

            s2, r, terminal, info = env.step(action)

            # Choose reward type
            ep_reward += r

            episode_buffer = np.append(episode_buffer, [[s, a, r, terminal, s2]],
                                       axis=0)

            # Adding experience to memory
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                ep_ave_max_q += np.max(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            # Set previous state for next step
            s = s2

            if terminal:
                # Reward system for episode
                # episode_buffer = reward.discount(episode_buffer)

                # Add episode to replay buffer
                for step in episode_buffer:
                    replay_buffer.add(np.reshape(step[0], (actor.s_dim,)),
                                      np.reshape(step[1], (actor.a_dim,)),
                                      step[2], step[3],
                                      np.reshape(step[4], (actor.s_dim,)))

                # summary = tf.summary()
                # summary.value.add(tag="Perf/Reward", simple_value=float(ep_reward))
                # summary.value.add(tag="Perf/Qmax", simple_value=float(ep_ave_max_q / float(j)))
                # summary_writer.add_summary(summary, i)
                # summary_writer.flush()

                if i != 0:
                    print("|Reward: %.2i | Episode: %d | Qmax: %.4f" % (
                        int(ep_reward), i, (ep_ave_max_q / float(i))))

                break
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

        # Score tracker and learning parameters
        self.best_score = -np.inf

    def reset_episode(self):
        self.total_reward = 0.0
        self.count = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Save reward
        self.total_reward += reward
        self.count += 1

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

        if done:
            # Keeping track of the score
            self.score = self.total_reward / float(self.count) if self.count else 0.0
            if self.score > self.best_score:
                self.best_score = self.score

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element
        # (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None
                          ]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
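# A typical interaction loop for the DDPG agent above. This is illustrative
# only: the surrounding project supplies its own Task and run script, so
# `task`, `num_episodes`, and the 3-tuple returned by task.step() are
# assumptions.
num_episodes = 500
agent = DDPG(task)

for i_episode in range(1, num_episodes + 1):
    state = agent.reset_episode()
    while True:
        action = agent.act(state)
        next_state, reward, done = task.step(action)
        agent.step(action, reward, next_state, done)
        state = next_state
        if done:
            print("Episode {:4d}: score = {:7.3f} (best = {:7.3f})".format(
                i_episode, agent.score, agent.best_score))
            break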
def __init__(self, state_size, action_size, device,
             replay_buffer_size=int(1e5), batch_size=64,
             discount_factor=0.99, soft_update=1e-3,
             learning_rate=5e-4, update_every=4, **kwargs):
    """Initializes the DQN agent.

    Args:
        state_size (int): Dimension of each state
        action_size (int): Dimension of each action
        device (torch.device): Device to use for calculations
        replay_buffer_size (int): Size of replay buffer
        batch_size (int): Size of experience batches during training
        discount_factor (float): Discount factor (gamma)
        soft_update (float): Soft update coefficient (tau)
        learning_rate (float): Learning rate (alpha)
        update_every (int): Steps between updating the network
        **kwargs: Arguments describing the QNetwork
    """
    self.state_size = state_size
    """Dimension of each state"""

    self.action_size = action_size
    """Dimension of each action"""

    self.device = device
    """Device to use for calculations"""

    # Parameters
    self.batch_size = batch_size
    """Size of experience batches during training"""

    self.discount_factor = discount_factor
    """Discount factor (gamma)"""

    self.soft_update = soft_update
    """Soft update coefficient (tau)"""

    self.update_every = update_every
    """Steps between updating the network"""

    # Q Networks
    self.target_network = QNetwork(state_size, action_size, **kwargs).to(device)
    """Target Q-Network"""

    self.local_network = QNetwork(state_size, action_size, **kwargs).to(device)
    """Local Q-Network"""

    self.optimizer = optim.Adam(self.local_network.parameters(), lr=learning_rate)
    """Optimizer used when training the Q-network."""

    # Memory
    self.memory = ReplayBuffer(replay_buffer_size, batch_size, device)

    # Time step
    self.t_step = 0
    """Current time step"""
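# Not part of the original snippet: a minimal epsilon-greedy action-selection
# sketch that would pair with the DQN constructor above, written as it would
# appear as a method of that agent class. It assumes local_network maps a state
# tensor to per-action Q-values; the epsilon handling is illustrative only.
import random

import numpy as np
import torch


def act(self, state, epsilon=0.0):
    """Pick an action for `state`, exploring with probability `epsilon`."""
    state_t = torch.from_numpy(np.asarray(state, dtype=np.float32)) \
        .unsqueeze(0).to(self.device)

    # Evaluate the local Q-network without tracking gradients
    self.local_network.eval()
    with torch.no_grad():
        q_values = self.local_network(state_t)
    self.local_network.train()

    if random.random() > epsilon:
        return int(q_values.argmax(dim=1).item())  # greedy action
    return random.randrange(self.action_size)      # random exploration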
processes = [
    multiprocessing.Process(target=environment_process, args=(port, ), daemon=True)
    for i in range(NUM_ENVIRONMENTS)
]
for p in processes:
    p.start()

step = 0
start = time.time()
recent_steps = []
recent_total_reward = []
recent_collisions = []
recent_values = []
recent_stats = [[] for _ in range(5)]

replay_buffer = ReplayBuffer(REPLAY_MAX)

while True:
    # Read request and process.
    request = socket.recv_pyobj()
    instruction = request[0]
    instruction_data = request[1:]

    if instruction == 'CALL_GENERATOR':
        if len(replay_buffer) < REPLAY_MIN:
            if TASK in ['DriveStraight', 'DriveHard']:
                params = rand_macro_action_set_drive_straight(7)
            else:
                params = rand_macro_action_set(8, 3)
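# Not from the original code: a sketch of how an environment worker could talk
# to the request loop above, assuming the server socket is a pyzmq REP socket
# and that `port` and the 'CALL_GENERATOR' instruction seen above are used as
# shown. The request/reply payloads here are guesses for illustration only.
import zmq


def environment_process(port):
    context = zmq.Context()
    socket = context.socket(zmq.REQ)
    socket.connect('tcp://127.0.0.1:{}'.format(port))
    while True:
        # Ask the server for macro-action parameters, then act on them.
        socket.send_pyobj(('CALL_GENERATOR',))
        params = socket.recv_pyobj()
        # ... run the environment with `params` and report results ...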
def train(env, config):
    """Execute training of Soft Actor Critic."""
    timesteps_elapsed = 0
    episodes_elapsed = 0

    STATE_SIZE = env.observation_space.shape[0]
    ACTION_SIZE = env.action_space.n

    # policy_net = Net(sizes=(STATE_SIZE, *config["hidden_size"], ACTION_SIZE)).to(device=config["device"])
    policy_net = ConvNet(nChannels=1, nOut=ACTION_SIZE).to(config["device"])
    policy_net.apply(init_weights)
    Q_net1 = ConvNet(nChannels=1, nOut=ACTION_SIZE).to(config["device"])
    Q_net1.apply(init_weights)
    Q_net2 = ConvNet(nChannels=1, nOut=ACTION_SIZE).to(config["device"])
    Q_net2.apply(init_weights)
    Q_target_net1 = copy.deepcopy(Q_net1)
    Q_target_net2 = copy.deepcopy(Q_net2)
    Q_target_net1.freeze()
    Q_target_net2.freeze()

    log_alpha = nn.Parameter(
        torch.tensor([math.log(config["alpha"])], device=config["device"]))
    entropy_target = -math.log(1 / ACTION_SIZE) * config["target_entropy_ratio"]  # maximum entropy times a ratio

    optimizer_policy = torch.optim.Adam(policy_net.parameters(),
                                        lr=config["learning_rate_policy"])
    optimizer_q = torch.optim.Adam(list(Q_net1.parameters()) + list(Q_net2.parameters()),
                                   lr=config["learning_rate_value"])
    optimizer_alpha = torch.optim.Adam([log_alpha],
                                       lr=config["learning_rate_alpha"],
                                       eps=1e-4)

    replay_buffer = ReplayBuffer(config["buffer_capacity"])
    n_step_buffer = NstepBuffer(config["n_steps"])

    eval_returns_all = []
    eval_times_all = []

    train_policy = False
    start_time = time.time()
    with tqdm(total=config["max_timesteps"]) as pbar:
        while timesteps_elapsed < config["max_timesteps"]:
            elapsed_seconds = time.time() - start_time
            if elapsed_seconds > config["max_time"]:
                pbar.write("Training ended after {}s.".format(elapsed_seconds))
                break

            episode_timesteps, _ = play_episode(
                env,
                policy_net=policy_net,
                Q_net1=Q_net1,
                Q_net2=Q_net2,
                Q_target_net1=Q_target_net1,
                Q_target_net2=Q_target_net2,
                optimizer_policy=optimizer_policy,
                optimizer_q=optimizer_q,
                optimizer_alpha=optimizer_alpha,
                log_alpha=log_alpha,
                entropy_target=entropy_target,
                gamma=config["gamma"],
                replay_buffer=replay_buffer,
                n_step_buffer=n_step_buffer,
                train=True,
                train_policy=train_policy,
                render=config["render"],
                max_steps=config["episode_length"],
                steps_init_training=config["steps_init_training"],
                steps_per_learning_update=config["steps_per_learning_update"],
                batch_size=config["batch_size"],
                device=config["device"])
            timesteps_elapsed += episode_timesteps
            episodes_elapsed += 1
            pbar.update(episode_timesteps)

            if timesteps_elapsed % config["train_policy_freq"] < episode_timesteps:
                train_policy = True
            else:
                train_policy = False

            if timesteps_elapsed > config["steps_init_training"]:
                if (timesteps_elapsed - config["steps_init_training"]) \
                        % config["target_update_freq"] < episode_timesteps:
                    Q_target_net1.soft_update(Q_net1, 0.8)
                    Q_target_net2.soft_update(Q_net2, 0.8)

            if timesteps_elapsed % config["eval_freq"] < episode_timesteps:
                eval_returns = 0
                for _ in range(config["eval_episodes"]):
                    _, episode_return = play_episode(
                        env,
                        policy_net,
                        Q_net1,
                        Q_net2,
                        Q_target_net1,
                        Q_target_net2,
                        optimizer_policy,
                        optimizer_q,
                        optimizer_alpha=optimizer_alpha,
                        log_alpha=log_alpha,
                        entropy_target=entropy_target,
                        gamma=config["gamma"],
                        replay_buffer=replay_buffer,
                        n_step_buffer=n_step_buffer,
                        train=False,
                        train_policy=train_policy,
                        render=config["render"],
                        max_steps=config["episode_length"],
                        batch_size=config["batch_size"],
                        device=config["device"])
                    eval_returns += episode_return
                eval_returns = eval_returns / config["eval_episodes"]
                eval_returns_all.append(eval_returns)
                pbar.write(
                    "Evaluation at timestep {} and episode {} returned a mean return of {}"
                    .format(timesteps_elapsed, episodes_elapsed, eval_returns))
                if eval_returns >= config["target_return"]:
                    pbar.write(
                        "Reached return {} >= target return of {}".format(
                            eval_returns, config["target_return"]))
                    break

    print("Saving policy to {}".format(config["save_filename"]))
    torch.save(policy_net, config["save_filename"])

    return np.array(eval_returns_all)
class Agent(object):
    """DDPG Agent that interacts and learns from the environment."""

    def __init__(self, state_size, action_size, device, actor_args={}, critic_args={}):
        """Initializes the DDPG agent.

        Args:
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            device (torch.device): Device to use for calculations
            actor_args (dict): Arguments describing the actor network
            critic_args (dict): Arguments describing the critic network
        """
        self.state_size = state_size
        """Dimension of each state"""

        self.action_size = action_size
        """Dimension of each action"""

        self.device = device
        """Device to use for calculations"""

        self.t_step = 0
        """Timestep between training updates"""

        # Actor network
        self.actor_local = Actor(state_size, action_size, **actor_args).to(device)
        self.actor_target = Actor(state_size, action_size, **actor_args).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic network
        self.critic_local = Critic(state_size, action_size, **critic_args).to(device)
        self.critic_target = Critic(state_size, action_size, **critic_args).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process for exploration
        self.noise = OUNoise(action_size, sigma=NOISE_SD)

        # Replay memory
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, self.device)

    def reset(self):
        """Reset state of agent."""
        self.noise.reset()

    def save_weights(self, path):
        """Save local network weights.

        Args:
            path (string): File to save to
        """
        torch.save(
            {
                'actor_local': self.actor_local.state_dict(),
                'actor_target': self.actor_target.state_dict(),
                'critic_local': self.critic_local.state_dict(),
                'critic_target': self.critic_target.state_dict()
            }, path)

    def load_weights(self, path):
        """Load local network weights.

        Args:
            path (string): File to load weights from
        """
        checkpoint = torch.load(path)
        self.actor_local.load_state_dict(checkpoint['actor_local'])
        self.actor_target.load_state_dict(checkpoint['actor_target'])
        self.critic_local.load_state_dict(checkpoint['critic_local'])
        self.critic_target.load_state_dict(checkpoint['critic_target'])

    def act(self, state, add_noise=True):
        """Returns action for given state according to the current policy.

        Args:
            state (np.ndarray): Current state

        Returns:
            action (np.ndarray): Action tuple
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

        # Temporarily set evaluation mode (no dropout &c) & turn off autograd
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().detach().numpy()
        # Resume training mode
        self.actor_local.train()

        # Add noise if exploring
        if add_noise:
            action += self.noise.sample()
            # The noise might take us out of range
            action = np.clip(action, -1, 1)

        return action

    def step(self, state, action, reward, next_state, done):
        """Save experience and learn if due.

        Args:
            state (Tensor): Current state
            action (int): Chosen action
            reward (float): Resulting reward
            next_state (Tensor): State after action
            done (bool): True if terminal state
        """
        self.memory.add(state, action, reward, next_state, done)

        # Learn as soon as we have enough stored experiences
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0 and len(self.memory) > BATCH_SIZE:
            for _ in range(NUM_UPDATES):
                experiences = self.memory.sample()
                self.learn(experiences)

    def learn(self, experiences):
        """Learn from batch of experiences."""
        states, actions, rewards, next_states, dones = experiences

        # region Update Critic
        actions_next = self.actor_target(next_states)
        q_targets_next = self.critic_target(next_states, actions_next)
        q_targets = rewards + (GAMMA * q_targets_next * (1 - dones))
        q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(q_expected, q_targets)
        # Minimize loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1.0)
        self.critic_optimizer.step()
        # endregion

        # region Update Actor
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        # endregion

        # Update target networks
        soft_update(self.critic_local, self.critic_target, TAU)
        soft_update(self.actor_local, self.actor_target, TAU)
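# Not part of the original snippet: the Agent above relies on module-level
# hyperparameters and a free soft_update() helper. The names below are taken
# directly from the class body; the values are illustrative placeholders, and
# this helper is one standard Polyak-averaging implementation, not necessarily
# the original one.
BUFFER_SIZE = int(1e6)     # replay buffer size
BATCH_SIZE = 128           # minibatch size
GAMMA = 0.99               # discount factor
TAU = 1e-3                 # soft update coefficient
LR_ACTOR = 1e-4            # actor learning rate
LR_CRITIC = 1e-3           # critic learning rate
WEIGHT_DECAY = 0.0         # critic L2 weight decay
NOISE_SD = 0.2             # OU noise sigma
UPDATE_EVERY = 20          # steps between learning rounds
NUM_UPDATES = 10           # gradient updates per learning round


def soft_update(local_model, target_model, tau):
    """Polyak averaging: theta_target <- tau*theta_local + (1-tau)*theta_target."""
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(tau * local_param.data +
                                (1.0 - tau) * target_param.data)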
def __init__(self, state_size, action_size, num_agents, random_seed, device,
             lr_actor, lr_critic, weight_decay_critic, batch_size, buffer_size,
             gamma, tau, update_every, n_updates, eps_start, eps_end, eps_decay):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        num_agents (int): number of agents
        random_seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.num_agents = num_agents
    self.seed = random.seed(random_seed)
    self.t_step = 0

    self.device = device
    self.lr_actor = lr_actor
    self.lr_critic = lr_critic
    self.weight_decay_critic = weight_decay_critic
    self.batch_size = batch_size
    self.buffer_size = buffer_size
    self.gamma = gamma
    self.tau = tau
    self.update_every = update_every
    self.n_updates = n_updates
    self.eps = eps_start
    self.eps_end = eps_end
    self.eps_decay = eps_decay

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(self.device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(self.device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, random_seed).to(self.device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(self.device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=self.lr_critic,
                                       weight_decay=self.weight_decay_critic)

    # Noise process
    self.noise = OUNoise((num_agents, action_size), random_seed)

    # Replay memory
    self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size,
                               random_seed, self.device)
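# Not from the original code: an illustrative construction of the multi-agent
# DDPG agent above. Only the argument names come from the __init__ signature;
# every value here is a placeholder guess, not a tuned setting.
import torch

agent = Agent(state_size=24, action_size=2, num_agents=2, random_seed=0,
              device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
              lr_actor=1e-4, lr_critic=1e-3, weight_decay_critic=0.0,
              batch_size=128, buffer_size=int(1e6), gamma=0.99, tau=1e-3,
              update_every=20, n_updates=10,
              eps_start=1.0, eps_end=0.01, eps_decay=0.995)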
class Learner(object):
    """
    Generic object which runs the main training loop of anything that trains
    using a replay buffer. Handles updating, logging, saving/loading,
    batching, etc.
    """

    def __init__(self, interactor_queue, lock, config, env_config, learner_config, **bonus_kwargs):
        self.learner_name = self.learner_name()
        self.interactor_queue = interactor_queue
        self.learner_lock = lock
        self.config = config
        self.env_config = env_config
        self.learner_config = learner_config
        self.bonus_kwargs = bonus_kwargs
        self.kill_threads = False
        self.permit_desync = False
        self.need_frames_notification = threading.Condition()
        self._reset_inspections()
        self.total_frames = 0

        self.save_path = util.create_directory(
            "%s/%s/%s/%s" % (self.config["output_root"], self.config["env"]["name"],
                             self.config["name"], self.config["save_model_path"]))
        self.log_path = util.create_directory(
            "%s/%s/%s/%s" % (self.config["output_root"], self.config["env"]["name"],
                             self.config["name"], self.config["log_path"])) \
            + "/%s.log" % self.learner_name

        # replay buffer to store data
        self.replay_buffer_lock = threading.RLock()
        self.replay_buffer = ReplayBuffer(self.learner_config["replay_size"],
                                          np.prod(self.env_config["obs_dims"]),
                                          self.env_config["action_dim"])

        # data loaders pull data from the replay buffer and put it into the
        # tfqueue for model usage
        self.data_loaders = self.make_loader_placeholders()
        queue_capacity = np.ceil(1. / self.learner_config["frames_per_update"]) \
            if self.learner_config["frames_per_update"] else 100
        self.tf_queue = tf.FIFOQueue(capacity=queue_capacity,
                                     dtypes=[dl.dtype for dl in self.data_loaders])
        self.enqueue_op = self.tf_queue.enqueue(self.data_loaders)
        self.current_batch = self.tf_queue.dequeue()

        # build the TF graph for the actual model to train
        self.core, self.train_losses, self.train_ops, self.inspect_losses = self.make_core_model()
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

    ## Mandatory functions to override
    def learner_name(self):
        raise Exception('unimplemented: learner_name')

    def make_loader_placeholders(self):
        raise Exception('unimplemented: make_loader_placeholders')

    def make_core_model(self):
        raise Exception('unimplemented: make_core_model')

    ## Optional functions to override
    def initialize(self):
        warnings.warn('unimplemented: initialize')

    def resume_from_checkpoint(self, epoch):
        warnings.warn('unimplemented: resume_from_checkpoint')

    def checkpoint(self):
        warnings.warn('unimplemented: checkpoint')

    def backup(self):
        warnings.warn('unimplemented: backup')

    ## Internal functions
    def _start(self):
        # fetch data from the interactors to pre-fill the replay buffer
        self.prefetch_thread = threading.Thread(
            target=self._poll_interactors,
            args=(True, self.learner_config["frames_before_learning"],))
        self.prefetch_thread.start()
        self.prefetch_thread.join()

        # start the interactor and data loader
        self.data_load_thread = threading.Thread(target=self._run_enqueue_data)
        self.data_load_thread.start()

        # initialize the learner, pretraining if needed
        if self.config["resume"]:
            self._resume_from_checkpoint()
        else:
            self._initialize()

        # re-sync everything, and start up interactions with the environment
        self.interactor_poll_thread = threading.Thread(target=self._poll_interactors)
        self.interactor_poll_thread.start()

        # start the clock
        self._last_checkpoint_time = time.time()

    def _learn(self, permit_desync=False, log=True, checkpoint=True, backup=True):
        # this is to keep the frames/update synced properly
        if self.learner_config["frames_per_update"] is not False and not permit_desync:
            if not self._have_enough_frames():
                with self.need_frames_notification:
                    self.need_frames_notification.notify()
                return

        # log
        if log and (self.update_i + 1) % self.learner_config["log_every_n"] == 0:
            self._log()

        # checkpoint
        if checkpoint and (self.update_i + 1) % self.learner_config["epoch_every_n"] == 0:
            self._checkpoint()

        # backup
        if backup and (self.update_i + 1) % self.learner_config["backup_every_n"] == 0:
            self._backup()

        # train
        self._training_step()

    def _have_enough_frames(self):
        gathered_frames = self.total_frames - self.learner_config["frames_before_learning"]
        return gathered_frames > self.learner_config["frames_per_update"] * self.update_i

    def _initialize(self):
        self.epoch = 0
        self.update_i = 0
        self.hours = 0
        self._last_checkpoint_time = time.time()

        self.initialize()

        if self.learner_config["pretrain_n"]:
            self._pretrain()

        self._checkpoint()

    def _pretrain(self):
        for _ in range(self.learner_config["pretrain_n"]):
            self._learn(permit_desync=True, checkpoint=False, backup=False)
        self.epoch = 0
        self.update_i = 0

    def _resume_from_checkpoint(self):
        epoch = util.get_largest_epoch_in_dir(self.save_path, self.core.saveid)
        if not self.config['keep_all_replay_buffers']:
            util.wipe_all_but_largest_epoch_in_dir(self.save_path, self.core.saveid)
        if epoch is False:
            raise Exception("Tried to reload but no model found")

        with self.learner_lock:
            self.core.load(self.sess, self.save_path, epoch)
            self.epoch, self.update_i, self.total_frames, self.hours = self.sess.run(
                [self.core.epoch_n, self.core.update_n, self.core.frame_n, self.core.hours])
        with self.replay_buffer_lock:
            self.replay_buffer.load(self.save_path, '%09d_%s' % (epoch, self.learner_name))

        self.resume_from_checkpoint(epoch)

    def _log(self):
        if self.denom > 0:
            logstring = "(%3.2f sec) h%-8.2f e%-8d s%-8d f%-8d\t" % (
                time.time() - self._log_time, self.hours, self.epoch,
                self.update_i + 1, self.total_frames) \
                + ', '.join(["%8f" % x for x in (self.running_total / self.denom).tolist()])
            print("%s\t%s" % (self.learner_name, logstring))
            with open(self.log_path, "a") as f:
                f.write(logstring + "\n")
        self._reset_inspections()

    def _reset_inspections(self):
        self.running_total = 0.
        self.denom = 0.
        self._log_time = time.time()

    def _checkpoint(self):
        self.checkpoint()
        self.epoch += 1
        self.hours += (time.time() - self._last_checkpoint_time) / 3600.
        self._last_checkpoint_time = time.time()
        self.core.update_epoch(self.sess, self.epoch, self.update_i,
                               self.total_frames, self.hours)
        with self.learner_lock:
            self.core.save(self.sess, self.save_path)

    def _backup(self):
        self.backup()
        if not self.learner_config['keep_all_replay_buffers']:
            util.wipe_all_but_largest_epoch_in_dir(self.save_path, self.core.saveid)
        with self.learner_lock:
            self.core.save(self.sess, self.save_path, self.epoch)
        with self.replay_buffer_lock:
            self.replay_buffer.save(self.save_path, '%09d_%s' % (self.epoch, self.learner_name))

    def _training_step(self):
        train_ops = tuple([op for op, loss in zip(self.train_ops, self.train_losses)
                           if loss is not None])
        outs = self.sess.run(train_ops + self.inspect_losses)
        self.running_total += np.array(outs[len(train_ops):])
        self.denom += 1.
        self.update_i += 1

    def _poll_interactors(self, continuous_poll=False, frames_before_terminate=None):
        # poll the interactors for new frames.
        # the synced_condition semaphore prevents this from consuming too much CPU
        while not self.kill_threads:
            if self.learner_config["frames_per_update"] is not False and not continuous_poll:
                with self.need_frames_notification:
                    self.need_frames_notification.wait()
            while not self.interactor_queue.empty():
                new_frames = self.interactor_queue.get()
                self._add_frames(new_frames)
            if frames_before_terminate and self.total_frames >= frames_before_terminate:
                return

    def _add_frames(self, frames):
        with self.replay_buffer_lock:
            for frame in frames:
                self.replay_buffer.add_replay(*frame)
            self.total_frames = self.replay_buffer.count
        return self.total_frames

    def _run_enqueue_data(self):
        while not self.kill_threads:
            data = self.replay_buffer.random_batch(self.learner_config["batch_size"])
            self.sess.run(self.enqueue_op, feed_dict=dict(list(zip(self.data_loaders, data))))

    def _kill_threads(self):
        self.kill_threads = True
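# Not part of the original code: a minimal sketch of what a concrete Learner
# subclass has to provide, based only on the three "mandatory functions to
# override" above. Shapes, names, and the toy model are illustrative; a real
# core object would also need the saveid/save/load/epoch bookkeeping that the
# base class calls during checkpointing and resuming.
import numpy as np
import tensorflow as tf


class ToyLearner(Learner):
    def learner_name(self):
        return "toy"

    def make_loader_placeholders(self):
        # One placeholder per element of a replay batch; the base class zips
        # these with each sampled batch when feeding the enqueue op.
        obs_dim = int(np.prod(self.env_config["obs_dims"]))
        act_dim = self.env_config["action_dim"]
        self.obs_ph = tf.placeholder(tf.float32, [None, obs_dim])
        self.act_ph = tf.placeholder(tf.float32, [None, act_dim])
        self.rew_ph = tf.placeholder(tf.float32, [None, 1])
        return [self.obs_ph, self.act_ph, self.rew_ph]

    def make_core_model(self):
        # The base class expects (core, train_losses, train_ops, inspect_losses).
        obs, _act, rew = self.current_batch
        obs.set_shape([None, int(np.prod(self.env_config["obs_dims"]))])
        rew.set_shape([None, 1])
        pred = tf.layers.dense(obs, 1)
        loss = tf.reduce_mean(tf.square(pred - rew))
        train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)
        core = None  # placeholder; a real core handles saving/loading/epoch counters
        return core, (loss,), (train_op,), (loss,)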