Example 1
    def __init__(self, config):
        self.config = config
        self.state_size = config.state_size
        self.action_size = config.action_size

        self.actor_local = Actor(self.state_size, self.action_size,
                                 2).to(device)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  2).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=config.LR_ACTOR)

        self.critic_local = Critic(self.state_size, self.action_size,
                                   2).to(device)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    2).to(device)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=config.LR_CRITIC,
        )

        self.memory = ReplayBuffer(config.random_seed, config.BUFFER_SIZE)
        self.noise = OUNoise(self.action_size, config.random_seed)

        self.t_step = 0

        self.soft_update(self.critic_local, self.critic_target, 1)
        self.soft_update(self.actor_local, self.actor_target, 1)
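
Example 1 initializes its target networks by calling self.soft_update(..., 1), i.e. a hard copy, but the helper itself is not part of the snippet. A minimal sketch of the usual Polyak update for two PyTorch modules with identical architectures (illustrative; it mirrors the soft_update method that appears later in Example 19):

import torch


def soft_update(local_model, target_model, tau):
    """theta_target <- tau * theta_local + (1 - tau) * theta_target.

    With tau=1 this reduces to the hard copy used in the constructor above.
    """
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(tau * local_param.data +
                                (1.0 - tau) * target_param.data)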
Example 2
    def __init__(self, env):
        self.buffer_size = 20000
        self.batch_size = 64
        self.tau = 1
        self.gamma = 0.95
        self.learning_rate = 0.001

        # Exploration Parameters
        self.E_start = 1
        self.E_end = 0.1
        self.E_decay = 0.002
        self.episode = 0

        self.env = env
        self.os = self.env.observation_space
        #         self.acs = self.env.action_space

        self.edim = len(self.os.high)
        self.adim = self.env.action_space.n

        self.buffer = ReplayBuffer(self.buffer_size, self.edim, 1)

        self.local = DuelingDQN_Model(self.edim, self.adim, self.learning_rate)
        self.target = DuelingDQN_Model(self.edim, self.adim,
                                       self.learning_rate)

        self.initial_weights = self.local.model.get_weights()
        self.target.model.set_weights(self.initial_weights)
Example 3
    def __init__(self,
                 state_shape,
                 num_actions,
                 action_scale=2.0,
                 discount=0.99,
                 tau=0.01,
                 actor_lrate=0.001,
                 critic_lrate=0.01,
                 l2_decay=1e-3,
                 batch_size=64,
                 q_update_iter=1,
                 capacity=1000000):

        if not isinstance(state_shape, tuple):
            raise AssertionError('state_shape must be of type <tuple>.')
        elif len(state_shape) == 0:
            raise AssertionError('No state space dimensions provided.')
        elif num_actions == 0:
            raise ValueError('Number of actions must be > 0.')
        elif capacity < batch_size:
            raise ValueError('Replay capacity must be > batch_size.')

        self.batch_size = batch_size
        self.q_update_iter = q_update_iter
        self.replay_buffer = ReplayBuffer(capacity, state_shape, num_actions)
        self.actor = Actor(state_shape, num_actions, action_scale, actor_lrate,
                           tau)
        self.critic = Critic(state_shape, num_actions, discount, critic_lrate,
                             tau, l2_decay)
        self.step = 0
Example 4
class Agent(object):
    """Implements an agent that follows DDPG algorithm."""
    def __init__(self,
                 state_shape,
                 num_actions,
                 action_scale=2.0,
                 discount=0.99,
                 tau=0.01,
                 actor_lrate=0.001,
                 critic_lrate=0.01,
                 l2_decay=1e-3,
                 batch_size=64,
                 q_update_iter=1,
                 capacity=1000000):

        if not isinstance(state_shape, tuple):
            raise AssertionError('state_shape must be of type <tuple>.')
        elif len(state_shape) == 0:
            raise AssertionError('No state space dimensions provided.')
        elif num_actions == 0:
            raise ValueError('Number of actions must be > 0.')
        elif capacity < batch_size:
            raise ValueError('Replay capacity must be > batch_size.')

        self.batch_size = batch_size
        self.q_update_iter = q_update_iter
        self.replay_buffer = ReplayBuffer(capacity, state_shape, num_actions)
        self.actor = Actor(state_shape, num_actions, action_scale, actor_lrate,
                           tau)
        self.critic = Critic(state_shape, num_actions, discount, critic_lrate,
                             tau, l2_decay)
        self.step = 0

    def choose_action(self, state):
        """Returns an action for the agent to perform in the environment."""
        return self.actor.predict(state).flatten()

    def update_buffer(self, s0, a, r, s1, terminal):
        """Updates memory replay buffer with new experience."""
        self.replay_buffer.update(s0, a, r, s1, terminal)

    def update_policy(self):
        """Updates Q-networks using replay memory data + performing SGD"""

        mb = self.replay_buffer.sample(self.batch_size)

        # To update the critic, we need a prediction from target policy
        target_a = self.actor.predict_target(mb[3])
        self.critic.train_fn(mb[0], mb[1], mb[3], target_a, mb[2], mb[4])

        # Updating the actor requires gradients from critic
        action = self.actor.predict(mb[0])
        grads = self.critic.get_action_grads(mb[0], action)
        self.actor.train_fn(mb[0], grads)

        # Every few steps in an episode we update target network weights
        if self.step == self.q_update_iter:
            self.actor.update_target()
            self.critic.update_target()
        self.step = self.step + 1 if self.step != self.q_update_iter else 0
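
update_policy above indexes the sampled minibatch positionally: mb[0] states, mb[1] actions, mb[2] rewards, mb[3] next states, mb[4] terminal flags. The ReplayBuffer class itself is defined elsewhere; a plausible NumPy sketch with the same constructor and sample layout (an assumption, not the original implementation) is:

import numpy as np


class ReplayBuffer(object):
    """Fixed-size circular buffer of (s0, a, r, s1, terminal) transitions."""

    def __init__(self, capacity, state_shape, num_actions):
        self.capacity = capacity
        self.count = 0  # number of valid entries
        self.head = 0   # next write position
        self.s0 = np.zeros((capacity,) + state_shape, dtype=np.float32)
        self.a = np.zeros((capacity, num_actions), dtype=np.float32)
        self.r = np.zeros((capacity, 1), dtype=np.float32)
        self.s1 = np.zeros((capacity,) + state_shape, dtype=np.float32)
        self.terminal = np.zeros((capacity, 1), dtype=np.float32)

    def update(self, s0, a, r, s1, terminal):
        i = self.head
        self.s0[i], self.a[i], self.r[i] = s0, a, r
        self.s1[i], self.terminal[i] = s1, float(terminal)
        self.head = (self.head + 1) % self.capacity
        self.count = min(self.count + 1, self.capacity)

    def sample(self, batch_size):
        idx = np.random.randint(0, self.count, size=batch_size)
        # Positional layout expected by update_policy:
        # states, actions, rewards, next states, terminal flags.
        return (self.s0[idx], self.a[idx], self.r[idx],
                self.s1[idx], self.terminal[idx])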
Example 5
def main():
    env_spec = registry[env_name]
    env = gym.make(env_spec["id"])
    ep_max_steps = env_spec["max_episode_steps"]
    agent = DDPG(env.observation_space.shape, env.action_space.shape,
                 env.action_space.low[0], env.action_space.high[0])
    replay_buffer = ReplayBuffer()

    state = env.reset()
    done = False
    ep_timesteps = 0
    ep_reward = 0
    ep_num = 0
    reward_history = []

    for t in range(TOTAL_TIMESTEPS):
        ep_timesteps += 1

        # Select action
        if t < START_TIMESTEP:
            action = env.action_space.sample()
        else:
            action = agent.select_action(np.array(state))

        # Perform action
        next_state, reward, done, _ = env.step(action)
        train_done = done and ep_timesteps < ep_max_steps

        replay_buffer.add(
            TransitionTuple(state, action, next_state, reward,
                            int(train_done)))

        state = next_state
        ep_reward += reward

        if t >= START_TIMESTEP:
            agent.train(replay_buffer, BATCH_SIZE)

        if done:
            reward_history.append(ep_reward)
            print(
                f"[Episode {ep_num+1}, Timestep {t+1}] Total reward: {ep_reward}  Total timesteps: {ep_timesteps}"
            )
            state = env.reset()
            done = False
            ep_timesteps = 0
            ep_reward = 0
            ep_num += 1

        if RENDER:
            env.render()

    # Visualize results
    if OUTPUT_PLOT:
        sns.lineplot(x=np.arange(len(reward_history)) + 1, y=reward_history)
        plt.ylabel("Episode Reward")
        plt.xlabel("Episode Number")
        plt.savefig(OUTPUT_PLOT)
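
The loop in Example 5 wraps each transition in a TransitionTuple before pushing it into the buffer; the tuple type is not shown. A minimal assumption consistent with the call site is a namedtuple whose fields follow the same order:

from collections import namedtuple

# Field order mirrors replay_buffer.add(TransitionTuple(state, action,
# next_state, reward, int(train_done))) in the training loop above.
TransitionTuple = namedtuple(
    "TransitionTuple", ["state", "action", "next_state", "reward", "done"])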
Example 6
    def __init__(self,
                 state_size,
                 action_size,
                 device,
                 actor_args={},
                 critic_args={}):
        """Initializes the DQN agent.

        Args:
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            device (torch.device): Device to use for calculations
            actor_args (dict): Arguments describing the actor network
            critic_args (dict): Arguments describing the critic network
        """
        self.state_size = state_size
        """Dimension of each state"""

        self.action_size = action_size
        """Dimension of each action"""

        self.device = device
        """Device to use for calculations"""

        self.t_step = 0
        """Timestep between training updates"""

        # Parameters

        # Actor network
        self.actor_local = Actor(state_size, action_size,
                                 **actor_args).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  **actor_args).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic network

        self.critic_local = Critic(state_size, action_size,
                                   **critic_args).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    **critic_args).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process for exploration
        self.noise = OUNoise(action_size, sigma=NOISE_SD)

        # Replay memory
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, self.device)
Example 7
def main():
    USE_CUDA = torch.cuda.is_available()

    env = gym.make('CartPole-v0')
    dqn = DQN(env.observation_space.shape[0], env.action_space.n)
    if USE_CUDA:
        dqn = dqn.cuda()
    optimizer = optim.RMSprop(dqn.parameters(),
                              lr=0.00025,
                              momentum=0.95,
                              alpha=0.95,
                              eps=0.01)
    epsilon_schedule = get_epsilon_schedule(start=1.0,
                                            end=0.01,
                                            endt=1000,
                                            learn_start=50)
    replay_buffer = ReplayBuffer(capacity=1000)
    agent = DQNAgent(env,
                     dqn,
                     optimizer,
                     epsilon_schedule,
                     replay_buffer,
                     discount_factor=0.99,
                     target_update_rate=10,
                     batch_size=32,
                     learn_start=50)

    agent.train(5000)
    total_reward = agent.play(render=True)
    agent.env.close()
    print('Total Reward: ', total_reward)
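
get_epsilon_schedule is imported from elsewhere in Example 7. One plausible reading of its arguments (an assumption based only on the parameter names) is a callable of the global step that holds epsilon at start until learn_start steps have passed and then anneals linearly to end by step endt:

def get_epsilon_schedule(start, end, endt, learn_start):
    """Return epsilon as a function of the global step (illustrative sketch)."""
    def epsilon(step):
        if step < learn_start:
            return start
        # Linear anneal from start to end over (endt - learn_start) steps.
        frac = min(1.0, (step - learn_start) / float(endt - learn_start))
        return start + frac * (end - start)
    return epsilon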
Example 8
    def __init__(self, interactor_queue, lock, config, env_config, learner_config, **bonus_kwargs):
        self.learner_name = self.learner_name()
        self.interactor_queue = interactor_queue
        self.learner_lock = lock
        self.config = config
        self.env_config = env_config
        self.learner_config = learner_config
        self.bonus_kwargs = bonus_kwargs
        self.kill_threads = False
        self.permit_desync = False
        self.need_frames_notification = threading.Condition()
        self._reset_inspections()
        self.total_frames = 0

        self.save_path = util.create_directory("%s/%s/%s/%s" % (self.config["output_root"], self.config["env"]["name"], self.config["name"], self.config["save_model_path"]))
        self.log_path = util.create_directory("%s/%s/%s/%s" % (self.config["output_root"], self.config["env"]["name"], self.config["name"],  self.config["log_path"])) + "/%s.log" % self.learner_name

        # replay buffer to store data
        self.replay_buffer_lock = threading.RLock()
        self.replay_buffer = ReplayBuffer(self.learner_config["replay_size"],
                                          np.prod(self.env_config["obs_dims"]),
                                          self.env_config["action_dim"])

        # data loaders pull data from the replay buffer and put it into the tfqueue for model usage
        self.data_loaders = self.make_loader_placeholders()
        queue_capacity = np.ceil(1./self.learner_config["frames_per_update"]) if self.learner_config["frames_per_update"] else 100
        self.tf_queue = tf.FIFOQueue(capacity=queue_capacity, dtypes=[dl.dtype for dl in self.data_loaders])
        self.enqueue_op = self.tf_queue.enqueue(self.data_loaders)
        self.current_batch = self.tf_queue.dequeue()

        # build the TF graph for the actual model to train
        self.core, self.train_losses, self.train_ops, self.inspect_losses = self.make_core_model()
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
Example 9
    def __init__(self, interactor_queue, lock, config, env_config,
                 learner_config, **bonus_kwargs):
        self.learner_name = self.learner_name()
        self.interactor_queue = interactor_queue
        self.learner_lock = lock
        self.config = config
        self.env_config = env_config
        self.learner_config = learner_config
        self.bonus_kwargs = bonus_kwargs
        self.kill_threads = False
        self.permit_desync = False
        self.need_frames_notification = threading.Condition()
        self._reset_inspections()
        self.total_frames = 0

        self.save_path = util.create_directory(
            "%s/%s/%s/%s" %
            (self.config["output_root"], self.config["env"]["name"],
             self.config["name"], self.config["save_model_path"]))
        self.log_path = util.create_directory(
            "%s/%s/%s/%s" %
            (self.config["output_root"], self.config["env"]["name"],
             self.config["name"],
             self.config["log_path"])) + "/%s.log" % self.learner_name

        # replay buffer to store data
        self.replay_buffer_lock = threading.RLock()
        self.replay_buffer = ReplayBuffer(self.learner_config["replay_size"],
                                          np.prod(self.env_config["obs_dims"]),
                                          self.env_config["action_dim"])

        # data loaders pull data from the replay buffer and put it into the tfqueue for model usage
        self.data_loaders = self.make_loader_placeholders()
        queue_capacity = np.ceil(
            1. / self.learner_config["frames_per_update"]
        ) if self.learner_config["frames_per_update"] else 100
        self.tf_queue = tf.FIFOQueue(
            capacity=queue_capacity,
            dtypes=[dl.dtype for dl in self.data_loaders])
        self.enqueue_op = self.tf_queue.enqueue(self.data_loaders)
        self.current_batch = self.tf_queue.dequeue()

        # build the TF graph for the actual model to train
        self.core, self.train_losses, self.train_ops, self.inspect_losses = self.make_core_model(
        )
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
Example 10
    def train(
        self,
        env: gym.Env,
        agent: Agent,
        network: Network,
        optimizer,
        window_size: int,
        nb_self_play: int,
        num_unroll_steps: int,
        td_steps: int,
        discount: float,
        batch_size: int,
        nb_train_update: int,
        nb_train_epochs: int,
        max_grad_norm: float,
        filename: str,
        ent_c: float,
    ):
        replay_buffer = ReplayBuffer(window_size, batch_size)

        for epoch in range(nb_train_epochs):
            network.eval()
            rewards = []
            for _ in range(nb_self_play):
                game_buffer = self._play_one_game(env, agent)
                # game_buffer.print_buffer()
                replay_buffer.append(game_buffer)
                rewards.append(np.sum(game_buffer.rewards))

            network.train()
            losses = []
            for _ in range(nb_train_update):
                batch = replay_buffer.sample_batch(num_unroll_steps, td_steps,
                                                   discount)
                losses.append(
                    self._update_weights(network, optimizer, batch,
                                         max_grad_norm, ent_c))
            v_loss, r_loss, p_loss, entropy = np.mean(losses, axis=0)
            print(
                f"Epoch[{epoch+1}]: Reward[{np.mean(rewards)}], Loss: V[{v_loss:.6f}]/R[{r_loss:.6f}]/P[{p_loss:.6f}]/E[{entropy:.6f}]"
            )

            if (epoch + 1) % 10 == 0:
                agent.save_model(filename)
Example 11
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0  #0
        self.exploration_theta = 0.15  #0.15
        self.exploration_sigma = 0.2  #0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

        # Score tracker and learning parameters
        self.best_score = -np.inf
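
Example 11 drives exploration with an Ornstein-Uhlenbeck process constructed as OUNoise(size, mu, theta, sigma); the class is defined elsewhere. A standard sketch consistent with those arguments and with the reset()/sample() calls in the snippet:

import numpy as np


class OUNoise(object):
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = np.copy(self.mu)

    def sample(self):
        """dx = theta * (mu - x) + sigma * N(0, I); return the updated state."""
        x = self.state
        dx = self.theta * (self.mu - x) + \
            self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state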
Example 12
class DuelingDQN(BaseAgent):
    def __init__(self, env):
        self.buffer_size = 20000
        self.batch_size = 64
        self.tau = 1
        self.gamma = 0.95
        self.learning_rate = 0.001

        # Exploration Parameters
        self.E_start = 1
        self.E_end = 0.1
        self.E_decay = 0.002
        self.episode = 0

        self.env = env
        self.os = self.env.observation_space
        #         self.acs = self.env.action_space

        self.edim = len(self.os.high)
        self.adim = self.env.action_space.n

        self.buffer = ReplayBuffer(self.buffer_size, self.edim, 1)

        self.local = DuelingDQN_Model(self.edim, self.adim, self.learning_rate)
        self.target = DuelingDQN_Model(self.edim, self.adim,
                                       self.learning_rate)

        self.initial_weights = self.local.model.get_weights()
        self.target.model.set_weights(self.initial_weights)

    def act(self, state, testing):
        state = state.reshape([1, -1])
        actionQs = self.local.model.predict(state)

        action = np.argmax(actionQs)

        epsilon = self.E_end + (self.E_start - self.E_end) * np.exp(
            -self.E_decay * self.episode)
        if (not testing):
            if (np.random.rand() < epsilon):
                action = np.random.choice(self.adim)

#         action = np.array([action])

        return action

    def learn(self, state, action, reward, next_state, done, testing):

        # Skip all learning during testing
        if (testing):
            return

        act_index = action

        self.buffer.add(state, act_index, reward, next_state, done)

        if (done):
            self.episode += 1

            # TODO When upgrading to RDPG this should be per-episode based

            states, actions, rewards, next_states, dones = self.buffer.batch(
                self.batch_size)

            actions = actions.astype(int).reshape([-1])
            rewards = rewards.reshape([-1])
            dones = dones.reshape([-1])
            # Bellman equation
            target_Q = rewards + self.gamma * np.amax(
                self.target.model.predict_on_batch(next_states),
                axis=1) * (1 - dones)

            self.local.train([states, actions, target_Q])

            self.soft_update(self.target, self.local)

    def soft_update(self, target, local):
        local_weights = np.array(local.model.get_weights())
        target_weights = np.array(target.model.get_weights())

        new_target_weights = (
            1 - self.tau) * target_weights + self.tau * local_weights

        target.model.set_weights(new_target_weights)

    def reset(self):
        shuffle_weights(self.local.model, self.initial_weights)
        self.target.model.set_weights(self.local.model.get_weights())
        self.episode = 0
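
reset() above depends on a shuffle_weights helper that is not included in the snippet. A common interpretation (assumed here) is the Keras trick of permuting the saved initial weight arrays, which keeps each array's value distribution while giving an effectively fresh initialization:

import numpy as np


def shuffle_weights(model, weights=None):
    """Randomly permute the entries of each weight array and load them into model."""
    if weights is None:
        weights = model.get_weights()
    shuffled = [
        np.random.permutation(w.flatten()).reshape(w.shape) for w in weights
    ]
    model.set_weights(shuffled)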
Example 13
def train(args):
    env, discrete = init_environment(env_name=args.env_name)
    thresh = env.spec.reward_threshold
    print("Starting training!  Need {} to solve".format(thresh))
    print(env)
    # print(env.observation_space["observation"].shape[0])
    seed_random(env, args.rand_seed)
    device = init_device()
    writer = init_logger(log_dir=args.log_dir)

    sac = SAC(env, device, at=args.alph_tune, dis=discrete).to(device)
    sac.init_opt(lr=args.learning_rate)
    reward_history = []
    eval_history = []
    reward_cum = 0
    max_reward = -float("inf")
    act_size = sac.actor.action_space
    replay = ReplayBuffer(args.buff_size, sac.actor.state_space, act_size)
    step = 0
    episode = 0
    while step < args.steps and episode < args.num_episodes:
        # Reset environment and record the starting state
        state = env.reset()
        reward_cum = 0
        done = False
        time = 0
        while not done and time < args.time_limit:
            state = torch.from_numpy(state).float().to(device)
            action = sac.get_action(state)
            next_state, reward, done, _ = env.step(action)
            if sac.discrete:
                action = get_one_hot_np(action, sac.soft_q1.action_space)
            replay.store(state.cpu(), action, next_state, reward, done)
            state = next_state
            reward_cum += reward
            step += 1
            time += 1
            if len(replay) > args.batch_size:
                update_SAC(
                    sac,
                    replay,
                    step,
                    writer,
                    batch_size=args.batch_size,
                )

            if step > 0 and step % args.eval_freq == 0:
                print("Evaluating")
                sac.eval()
                num = step / args.eval_freq
                curr_reward = evaluate_SAC(args, env, sac, writer, step)
                eval_history.append((num, curr_reward))
                if curr_reward > max_reward:
                    print("Saving model...")
                    max_reward = curr_reward
                    sac.save()
                print("Steps {} Eval Reward {:.2f}".format(step, curr_reward))
                sac.train()

        # Calculate score to determine when the environment has been solved
        reward_history.append(reward_cum)
        mean_score = np.mean(reward_history[-100:])
        if writer is not None:
            writer.add_scalar("stats/reward", reward_cum, step)
            writer.add_scalar("stats/avg_reward", mean_score, step)

        print(
            "Episode {} Steps {} Reward {:.2f} Avg reward {:.2f}".format(
                episode, step, reward_history[-1], mean_score
            )
        )

        episode += 1
        if thresh is not None and mean_score > thresh:
            print("Solved after {} episodes!".format(episode))
            print("And {} environment steps".format(step))
            break

    fname = "results.out"
    data = np.array(reward_history)
    np.savetxt(fname, data)
    plot_success(reward_history)
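
For discrete action spaces the loop above one-hot encodes the sampled action with get_one_hot_np before storing it in the replay buffer; the helper is defined elsewhere. A minimal NumPy sketch matching that call (assuming the second argument is the number of discrete actions):

import numpy as np


def get_one_hot_np(action, num_actions):
    """Return a one-hot vector of length num_actions with a 1 at index action."""
    one_hot = np.zeros(num_actions, dtype=np.float32)
    one_hot[int(action)] = 1.0
    return one_hot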
Example 14
num_episode = 5000
epsilon = 1

env = gym.make('Pendulum-v0')
num_action = env.action_space.shape[0]
num_state = env.observation_space.shape[0]

np.random.seed(123)

sess = tf.Session()
from keras import backend as K
K.set_session(sess)

actor = ActorNetwork(sess, num_state, num_action, batch_size, tau, actor_alpha)
critic = CriticNetwork(sess, num_state, num_action, batch_size, tau, critic_alpha)
buff = ReplayBuffer(buffer_size)

with open('actor_model.json', 'w') as json_file:
    json_file.write(actor.model.to_json())
with open('critic_model.json', 'w') as json_file:
    json_file.write(critic.model.to_json())

print 'start training'
best_r = -10000

actor.update_target_network()
critic.update_target_network()

try:
    for i in range(num_episode):
        total_reward = 0
Example 15
def challenger_round():
    challengers = []
    leaders = []
    leader_checkpoints = os.listdir(LEADER_DIR)
    # Need to share the same schedule with all challengers, so they all anneal
    # at same rate
    epsilon_schedule = LinearSchedule(EPS_START, EPS_END, TRAIN_FRAMES)
    for i in xrange(NUM_LEADERS):
        challenger = try_gpu(
            DQNAgent(6,
                     epsilon_schedule,
                     OBSERVATION_MODE,
                     lr=LR,
                     max_grad_norm=GRAD_CLIP_NORM))
        if i < len(leader_checkpoints):
            leader = try_gpu(
                DQNAgent(6, LinearSchedule(0.1, 0.1, 500000),
                         OBSERVATION_MODE))
            leader_path = os.path.join(LEADER_DIR, leader_checkpoints[i])
            print "LOADING CHECKPOINT: {}".format(leader_path)
            challenger.load_state_dict(
                torch.load(leader_path,
                           map_location=lambda storage, loc: storage))
            leader.load_state_dict(
                torch.load(leader_path,
                           map_location=lambda storage, loc: storage))
        else:
            leader = RandomAgent(6)
            print "INITIALIZING NEW CHALLENGER AND LEADER"
        challengers.append(challenger)
        leaders.append(leader)

    if CHALLENGER_DIR is not None:
        challengers = []
        # Load in all of the leaders
        for checkpoint in os.listdir(CHALLENGER_DIR):
            path = os.path.join(CHALLENGER_DIR, checkpoint)
            print "LOADING FROM CHALLENGER_DIR: {}".format(path)
            challenger = try_gpu(
                DQNAgent(6,
                         LinearSchedule(0.05, 0.05, 1),
                         CHALLENGER_OBSERVATION_MODE,
                         lr=LR,
                         max_grad_norm=GRAD_CLIP_NORM,
                         name=checkpoint))
            challenger.load_state_dict(
                torch.load(path, map_location=lambda storage, loc: storage))
            challengers.append(challenger)

    challenger = EnsembleDQNAgent(challengers)
    leader = EnsembleDQNAgent(leaders)
    if OPPONENT is not None or HUMAN:
        leader = NoOpAgent()
    replay_buffer = ReplayBuffer(1000000)
    rewards = collections.deque(maxlen=1000)
    frames = 0  # number of training frames seen
    episodes = 0  # number of training episodes that have been played
    with tqdm(total=TRAIN_FRAMES) as progress:
        # Each loop completes a single episode
        while frames < TRAIN_FRAMES:
            states = env.reset()
            challenger.reset()
            leader.reset()
            episode_reward = 0.
            episode_frames = 0
            # Each loop completes a single step, duplicates _evaluate() to
            # update at the appropriate frame #s
            for _ in xrange(MAX_EPISODE_LENGTH):
                frames += 1
                episode_frames += 1
                action1 = challenger.act(states[0])
                action2 = leader.act(states[1])
                next_states, reward, done = env.step(action1, action2)
                episode_reward += reward

                # NOTE: state and next_state are LazyFrames and must be
                # converted to np.arrays
                replay_buffer.add(
                    Experience(states[0], action1._action_index, reward,
                               next_states[0], done))
                states = next_states

                if len(replay_buffer) > 50000 and \
                        frames % 4 == 0:
                    experiences = replay_buffer.sample(32)
                    challenger.update_from_experiences(experiences)

                if frames % 10000 == 0:
                    challenger.sync_target()

                if frames % SAVE_FREQ == 0:
                    # TODO: Don't access internals
                    for agent in challenger._agents:
                        path = os.path.join(LEADER_DIR,
                                            agent.name + "-{}".format(frames))
                        print "SAVING CHECKPOINT TO: {}".format(path)
                        torch.save(agent.state_dict(), path)
                    #path = os.path.join(
                    #        LEADER_DIR, challenger.name + "-{}".format(frames))
                    #torch.save(challenger.state_dict(), path)

                if frames >= TRAIN_FRAMES:
                    break

                if done:
                    break

            if episodes % 300 == 0:
                print "Evaluation: {}".format(
                    evaluate(challenger, leader, EPISODES_EVALUATE_TRAIN))
            print "Episode reward: {}".format(episode_reward)
            episodes += 1
            rewards.append(episode_reward)
            stats = challenger.stats
            stats["Avg Episode Reward"] = float(sum(rewards)) / len(rewards)
            stats["Num Episodes"] = episodes
            stats["Replay Buffer Size"] = len(replay_buffer)
            progress.set_postfix(stats, refresh=False)
            progress.update(episode_frames)
            episode_frames = 0
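
The challengers in Example 15 share a single LinearSchedule so that they all anneal epsilon at the same rate, but the schedule class is not part of the snippet. A typical sketch (assuming an interface that interpolates linearly between the two endpoints over the given number of frames) is:

class LinearSchedule(object):
    """Linearly interpolate from start to end over num_steps steps, then hold at end."""

    def __init__(self, start, end, num_steps):
        self.start = float(start)
        self.end = float(end)
        self.num_steps = num_steps

    def value(self, step):
        frac = min(float(step) / self.num_steps, 1.0)
        return self.start + frac * (self.end - self.start)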
Example 16
value_criterion = nn.MSELoss()
soft_q_criterion1 = nn.MSELoss()
soft_q_criterion2 = nn.MSELoss()

value_lr = 3e-4
soft_q_lr = 3e-4
policy_lr = 3e-4

value_optimizer = optim.Adam(value_net.parameters(), lr=value_lr)
soft_q_optimizer1 = optim.Adam(soft_q_net1.parameters(), lr=soft_q_lr)
soft_q_optimizer2 = optim.Adam(soft_q_net2.parameters(), lr=soft_q_lr)
policy_optimizer = optim.Adam(policy_net.parameters(), lr=policy_lr)


replay_buffer_size = 1000000
replay_buffer = ReplayBuffer(replay_buffer_size)


max_frames = 40000
max_steps = 500
frame_idx = 0
rewards = []
batch_size = 128

while frame_idx < max_frames:
    state = env.reset()
    episode_reward = 0

    for step in range(max_steps):
        if frame_idx > 1000:
            action = policy_net.get_action(state).detach()
Example 17
class Learner(object):
    """
    Generic object which runs the main training loop of anything that trains using
    a replay buffer. Handles updating, logging, saving/loading, batching, etc.
    """
    def __init__(self, interactor_queue, lock, config, env_config,
                 learner_config, **bonus_kwargs):
        self.learner_name = self.learner_name()
        self.interactor_queue = interactor_queue
        self.learner_lock = lock
        self.config = config
        self.env_config = env_config
        self.learner_config = learner_config
        self.bonus_kwargs = bonus_kwargs
        self.kill_threads = False
        self.permit_desync = False
        self.need_frames_notification = threading.Condition()
        self._reset_inspections()
        self.total_frames = 0

        self.save_path = util.create_directory(
            "%s/%s/%s/%s" %
            (self.config["output_root"], self.config["env"]["name"],
             self.config["name"], self.config["save_model_path"]))
        self.log_path = util.create_directory(
            "%s/%s/%s/%s" %
            (self.config["output_root"], self.config["env"]["name"],
             self.config["name"],
             self.config["log_path"])) + "/%s.log" % self.learner_name

        # replay buffer to store data
        self.replay_buffer_lock = threading.RLock()
        self.replay_buffer = ReplayBuffer(self.learner_config["replay_size"],
                                          np.prod(self.env_config["obs_dims"]),
                                          self.env_config["action_dim"])

        # data loaders pull data from the replay buffer and put it into the tfqueue for model usage
        self.data_loaders = self.make_loader_placeholders()
        queue_capacity = np.ceil(
            1. / self.learner_config["frames_per_update"]
        ) if self.learner_config["frames_per_update"] else 100
        self.tf_queue = tf.FIFOQueue(
            capacity=queue_capacity,
            dtypes=[dl.dtype for dl in self.data_loaders])
        self.enqueue_op = self.tf_queue.enqueue(self.data_loaders)
        self.current_batch = self.tf_queue.dequeue()

        # build the TF graph for the actual model to train
        self.core, self.train_losses, self.train_ops, self.inspect_losses = self.make_core_model(
        )
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

    ## Mandatory functions to override
    def learner_name(self):
        raise Exception('unimplemented: learner_name')

    def make_loader_placeholders(self):
        raise Exception('unimplemented: make_loader_placeholders')

    def make_core_model(self):
        raise Exception('unimplemented: make_core_model')

    ## Optional functions to override
    def initialize(self):
        warnings.warn('unimplemented: initialize')

    def resume_from_checkpoint(self, epoch):
        warnings.warn('unimplemented: resume_from_checkpoint')

    def checkpoint(self):
        warnings.warn('unimplemented: checkpoint')

    def backup(self):
        warnings.warn('unimplemented: backup')

    ## Internal functions
    def _start(self):
        # fetch data from the interactors to pre-fill the replay buffer
        self.prefetch_thread = threading.Thread(
            target=self._poll_interactors,
            args=(
                True,
                self.learner_config["frames_before_learning"],
            ))
        self.prefetch_thread.start()
        self.prefetch_thread.join()

        # start the interactor and data loader
        self.data_load_thread = threading.Thread(target=self._run_enqueue_data)
        self.data_load_thread.start()

        # initialize the learner, pretraining if needed
        if self.config["resume"]: self._resume_from_checkpoint()
        else: self._initialize()

        # re-sync everything, and start up interactions with the environment
        self.interactor_poll_thread = threading.Thread(
            target=self._poll_interactors)
        self.interactor_poll_thread.start()

        # start the clock
        self._last_checkpoint_time = time.time()

    def _learn(self,
               permit_desync=False,
               log=True,
               checkpoint=True,
               backup=True):
        # this is to keep the frames/update synced properly
        if self.learner_config[
                "frames_per_update"] is not False and not permit_desync:
            if not self._have_enough_frames():
                with self.need_frames_notification:
                    self.need_frames_notification.notify()
                return

        # log
        if log and (self.update_i +
                    1) % self.learner_config["log_every_n"] == 0:
            self._log()

        # checkpoint
        if checkpoint and (self.update_i +
                           1) % self.learner_config["epoch_every_n"] == 0:
            self._checkpoint()

        # backup
        if backup and (self.update_i +
                       1) % self.learner_config["backup_every_n"] == 0:
            self._backup()

        # train
        self._training_step()

    def _have_enough_frames(self):
        gathered_frames = self.total_frames - self.learner_config[
            "frames_before_learning"]
        return gathered_frames > self.learner_config[
            "frames_per_update"] * self.update_i

    def _initialize(self):
        self.epoch = 0
        self.update_i = 0
        self.hours = 0
        self._last_checkpoint_time = time.time()

        self.initialize()

        if self.learner_config["pretrain_n"]: self._pretrain()
        self._checkpoint()

    def _pretrain(self):
        for _ in range(self.learner_config["pretrain_n"]):
            self._learn(permit_desync=True, checkpoint=False, backup=False)
        self.epoch = 0
        self.update_i = 0

    def _resume_from_checkpoint(self):
        epoch = util.get_largest_epoch_in_dir(self.save_path, self.core.saveid)
        if not self.config['keep_all_replay_buffers']:
            util.wipe_all_but_largest_epoch_in_dir(self.save_path,
                                                   self.core.saveid)
        if epoch is False:
            raise Exception("Tried to reload but no model found")
        with self.learner_lock:
            self.core.load(self.sess, self.save_path, epoch)
            self.epoch, self.update_i, self.total_frames, self.hours = self.sess.run(
                [
                    self.core.epoch_n, self.core.update_n, self.core.frame_n,
                    self.core.hours
                ])
        with self.replay_buffer_lock:
            self.replay_buffer.load(self.save_path,
                                    '%09d_%s' % (epoch, self.learner_name))
        self.resume_from_checkpoint(epoch)

    def _log(self):
        logstring = "(%3.2f sec) h%-8.2f e%-8d s%-8d f%-8d\t" % (
            time.time() - self._log_time, self.hours, self.epoch,
            self.update_i + 1, self.total_frames) + ', '.join([
                "%8f" % x for x in (self.running_total / self.denom).tolist()
            ])
        print("%s\t%s" % (self.learner_name, logstring))
        with open(self.log_path, "a") as f:
            f.write(logstring + "\n")
        self._reset_inspections()

    def _reset_inspections(self):
        self.running_total = 0.
        self.denom = 0.
        self._log_time = time.time()

    def _checkpoint(self):
        self.checkpoint()
        self.epoch += 1
        self.hours += (time.time() - self._last_checkpoint_time) / 3600.
        self._last_checkpoint_time = time.time()
        self.core.update_epoch(self.sess, self.epoch, self.update_i,
                               self.total_frames, self.hours)
        with self.learner_lock:
            self.core.save(self.sess, self.save_path)

    def _backup(self):
        self.backup()
        if not self.learner_config['keep_all_replay_buffers']:
            util.wipe_all_but_largest_epoch_in_dir(self.save_path,
                                                   self.core.saveid)
        with self.learner_lock:
            self.core.save(self.sess, self.save_path, self.epoch)
        with self.replay_buffer_lock:
            self.replay_buffer.save(
                self.save_path, '%09d_%s' % (self.epoch, self.learner_name))

    def _training_step(self):
        train_ops = tuple([
            op for op, loss in zip(self.train_ops, self.train_losses)
            if loss is not None
        ])
        outs = self.sess.run(train_ops + self.inspect_losses)
        self.running_total += np.array(outs[len(train_ops):])
        self.denom += 1.
        self.update_i += 1

    def _poll_interactors(self,
                          continuous_poll=False,
                          frames_before_terminate=None):
        # poll the interactors for new frames.
        # the synced_condition semaphore prevents this from consuming too much CPU
        while not self.kill_threads:
            if self.learner_config[
                    "frames_per_update"] is not False and not continuous_poll:
                with self.need_frames_notification:
                    self.need_frames_notification.wait()
            while not self.interactor_queue.empty():
                new_frames = self.interactor_queue.get()
                self._add_frames(new_frames)
                if frames_before_terminate and self.total_frames >= frames_before_terminate:
                    return

    def _add_frames(self, frames):
        with self.replay_buffer_lock:
            for frame in frames:
                self.replay_buffer.add_replay(*frame)
            self.total_frames = self.replay_buffer.count
        return self.total_frames

    def _run_enqueue_data(self):
        while not self.kill_threads:
            data = self.replay_buffer.random_batch(
                self.learner_config["batch_size"])
            self.sess.run(self.enqueue_op,
                          feed_dict=dict(list(zip(self.data_loaders, data))))

    def _kill_threads(self):
        self.kill_threads = True
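
The Learner above is abstract: subclasses must supply learner_name, make_loader_placeholders and make_core_model. A hypothetical skeleton of such a subclass, assuming the replay buffer yields (obs, action, reward, next_obs, done) batches (the actual loaders and core model are project specific):

import numpy as np
import tensorflow as tf


class ExampleLearner(Learner):
    """Illustrative subclass showing the mandatory overrides only."""

    def learner_name(self):
        return "example_learner"

    def make_loader_placeholders(self):
        # One placeholder per array returned by replay_buffer.random_batch();
        # their dtypes determine the FIFOQueue built in Learner.__init__.
        obs_dim = int(np.prod(self.env_config["obs_dims"]))
        act_dim = self.env_config["action_dim"]
        batch = self.learner_config["batch_size"]
        self.obs = tf.placeholder(tf.float32, [batch, obs_dim])
        self.action = tf.placeholder(tf.float32, [batch, act_dim])
        self.reward = tf.placeholder(tf.float32, [batch, 1])
        self.next_obs = tf.placeholder(tf.float32, [batch, obs_dim])
        self.done = tf.placeholder(tf.float32, [batch, 1])
        return [self.obs, self.action, self.reward, self.next_obs, self.done]

    def make_core_model(self):
        # Must return (core, train_losses, train_ops, inspect_losses); the core
        # object also needs save/load/update_epoch and the epoch_n/update_n/
        # frame_n/hours variables used by the checkpointing code above.
        raise NotImplementedError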
Example 18
DISPLAY_Q_VALUES = True
DISPLAY_VAL_CHART = True
DISPLAY_HEATMAP = True

game_wrapper = GameWrapper(ENV_NAME, MAX_NOOP_STEPS)
print("The environment has the following {} actions: {}".format(
    game_wrapper.env.action_space.n,
    game_wrapper.env.unwrapped.get_action_meanings()))

MAIN_DQN = build_q_network(game_wrapper.env.action_space.n,
                           LEARNING_RATE,
                           input_shape=INPUT_SHAPE)
TARGET_DQN = build_q_network(game_wrapper.env.action_space.n,
                             input_shape=INPUT_SHAPE)

replay_buffer = ReplayBuffer(size=MEM_SIZE, input_shape=INPUT_SHAPE)
agent = Agent(MAIN_DQN,
              TARGET_DQN,
              replay_buffer,
              game_wrapper.env.action_space.n,
              input_shape=INPUT_SHAPE)

print('Loading agent...')
agent.load(RESTORE_PATH)


def display_nparray(arr, maxwidth=500):
    assert len(arr.shape) == 3

    height, width, _channels = arr.shape
Example 19
class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 random_seed,
                 device,
                 lr_actor,
                 lr_critic,
                 weight_decay_critic,
                 batch_size,
                 buffer_size,
                 gamma,
                 tau,
                 update_every,
                 n_updates,
                 eps_start,
                 eps_end,
                 eps_decay):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)
        self.t_step = 0
        self.device = device
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.weight_decay_critic = weight_decay_critic
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.gamma = gamma
        self.tau = tau
        self.update_every = update_every
        self.n_updates = n_updates
        self.eps = eps_start
        self.eps_end = eps_end
        self.eps_decay = eps_decay

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(self.device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(self.device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic, weight_decay=self.weight_decay_critic)

        # Noise process
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, random_seed, self.device)

    def step(self, state, action, reward, next_state, done, agent_number):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        self.t_step += 1
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory and at interval settings
        if len(self.memory) > self.batch_size:
            if self.t_step % self.update_every == 0:
                for _ in range(self.n_updates):
                    experiences = self.memory.sample()
                    self.learn(experiences, self.gamma, agent_number)

    def act(self, states, add_noise):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(self.device)
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for agent_num, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[agent_num, :] = action
        self.actor_local.train()
        if add_noise:
            actions += self.eps * self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, agent_number):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)

        if agent_number == 0:
            actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1)
        else:
            actions_next = torch.cat((actions[:, :2], actions_next), dim=1)

        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)

        if agent_number == 0:
            actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1)
        else:
            actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1)

        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

        # Update epsilon noise value
        self.eps = max(self.eps_end, self.eps_decay*self.eps)
        # self.eps = self.eps - (1/self.eps_decay)
        # if self.eps < self.eps_end:
        #     self.eps = self.eps_end

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
Example 20
def train(env, estimator, target_network, num_episodes=1000,
                    replay_memory_size=500000,
                    frame_history_len=4,
                    save_every=10,
                    update_every=1000,
                    discount=0.99, epsilon_start=1.0,
                    epsilon_end=0.1,
                    epsilon_decay_steps=50000,
                    batch_size=32, record_every=50):
    """
    deep q learning algorithm
    :param env: openAI gym environment
    :param estimator: estimator model for predicting values
    :param target_network: target network whose parameters are synced from the estimator every update_every steps
    :param num_episodes: number of episodes to run
    :param replay_memory_size: size of replay memory
    :param update_every: copy params from estimator into target estimator after this many steps
    :param discount: discount factor
    :param epsilon_start: starting epsilon value
    :param epsilon_end: ending epsilon value
    :param batch_size: size of the minibatches sampled from replay memory
    :param record_every: record a video every N episodes
    :return:
    """

    # Load previous state here
    replay_memory = ReplayBuffer(replay_memory_size, frame_history_len)

    # epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

    loss_func = nn.SmoothL1Loss()
    optimizer = torch.optim.Adam(estimator.parameters())

    policy = make_epsilon_greedy_policy(estimator, len(VALID_ACTIONS))

    env = Monitor(env, directory="./monitor",
                  resume=True,
                  video_callable=lambda count: count % record_every == 0)

    total_t = 0
    pbar = tqdm(range(num_episodes))
    pbar.set_description("ep: %d, er: %.2f, et: %d, tt: %d, exp_size: %d" % (0, 0.0, 0, 0, 0))

    for ep in pbar:

        state = env.reset()  # 210 x 160 x 4
        state = process_state(state)  # 94 x 94 x 3
        episode_loss = 0
        episode_reward = 0
        episode_t = 0

        for t in itertools.count():
            epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)]

            last_idx = replay_memory.store_frame(state)

            recent_observations = replay_memory.encode_recent_observation()

            action_dist = policy(recent_observations, epsilon)
            action_dist = action_dist.squeeze(0).numpy()
            action = np.random.choice(np.arange(len(action_dist)), p=action_dist)

            next_state, reward, done, _ = env.step(action)
            reward = max(-1.0, min(reward, 1.0))

            episode_reward += reward

            replay_memory.store_effect(last_idx, action, reward, done)
            next_state = process_state(next_state)

            state = next_state

            if replay_memory.can_sample(batch_size):
                obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_memory.sample(batch_size)
                # Normalize pixel observations to [0, 1]; keep actions as
                # integer indices for gather()
                obs_batch = torch.from_numpy(obs_batch).float().to(device) / 255.0
                act_batch = torch.from_numpy(act_batch).long().to(device)
                rew_batch = torch.from_numpy(rew_batch).to(device)
                next_obs_batch = torch.from_numpy(next_obs_batch).float().to(device) / 255.0
                not_done_mask = torch.from_numpy(1 - done_mask).float().to(device)

                state_values = estimator(obs_batch)  # b x VALID_ACTIONS
                state_action_values = torch.gather(state_values, 1, act_batch.unsqueeze(1))  # b x 1

                next_state_values_max = target_network(next_obs_batch).detach().max(dim=1)[0]
                next_state_values = not_done_mask * next_state_values_max

                expected_q_value = (next_state_values * discount) + rew_batch

                # bellman_error = expected_q_value - state_action_values.squeeze(1)
                #
                # clipped_bellman_error = bellman_error.clamp(-1, 1)
                #
                # d_error = clipped_bellman_error * -1.0

                loss = loss_func(state_action_values, expected_q_value.unsqueeze(1))
                episode_loss += loss.item()

                # state_action_values.backward(d_error.data.unsqueeze(1))

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            if done:
                break

            total_t += 1
            episode_t = t

        pbar.set_description("ep: %d, el: %.5f, er: %.2f, et: %d, tt: %d, exp_size: %d" % (ep, episode_loss, episode_reward, episode_t, total_t, replay_memory.num_in_buffer))
        if total_t % update_every == 0:
            copy_model_params(estimator, target_network)

        # save checkpoint
        if ep % save_every == 0:
            torch.save(estimator.state_dict(), './checkpoints/checkpoint.pt')

    env.close()
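
make_epsilon_greedy_policy is assumed here to return a callable mapping (observation, epsilon) to a 1 x num_actions probability vector, since the loop samples an action from policy(...).squeeze(0).numpy(). A sketch under that assumption (not the original helper):

import numpy as np
import torch


def make_epsilon_greedy_policy(estimator, num_actions):
    """Wrap a Q-network in an epsilon-greedy action distribution."""
    def policy_fn(observation, epsilon):
        # Spread epsilon uniformly, then put the remaining mass on the greedy action.
        probs = torch.ones(1, num_actions) * epsilon / num_actions
        device = next(estimator.parameters()).device
        # Assumes observation is a raw uint8 frame stack, scaled as in training.
        obs = torch.from_numpy(np.asarray(observation)).float().unsqueeze(0)
        obs = obs.to(device) / 255.0
        with torch.no_grad():
            q_values = estimator(obs)
        best_action = int(q_values.argmax(dim=1).item())
        probs[0, best_action] += 1.0 - epsilon
        return probs
    return policy_fn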
Example 21
class Agent(object):
    """DQN Agent that interacts and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 device,
                 replay_buffer_size=int(1e5),
                 batch_size=64,
                 discount_factor=0.99,
                 soft_update=1e-3,
                 learning_rate=5e-4,
                 update_every=4,
                 **kwargs):
        """Initializes the DQN agent.

        Args:
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            device (torch.device): Device to use for calculations
            replay_buffer_size (int): Size of replay buffer
            batch_size (int): Size of experience batches during training
            discount_factor (float): Discount factor (gamma)
            soft_update (float): Soft update coefficient (tau)
            learning_rate (float): Learning rate (alpha)
            update_every (int): Steps between updating the network
            **kwargs: Arguments describing the QNetwork
        """
        self.state_size = state_size
        """Dimension of each state"""

        self.action_size = action_size
        """Dimension of each action"""

        self.device = device
        """Device to use for calculations"""

        # Parameters
        self.batch_size = batch_size
        """Size of experience batches during training"""

        self.discount_factor = discount_factor
        """Discount factor (gamma)"""

        self.soft_update = soft_update
        """Soft update coefficient (tau)"""

        self.update_every = update_every
        """Steps between updating the network"""

        # Q Networks
        self.target_network = QNetwork(state_size, action_size, **kwargs) \
            .to(device)
        """Target Q-Network"""

        self.local_network = QNetwork(state_size, action_size, **kwargs) \
            .to(device)
        """Local Q-Network"""

        self.optimizer = optim.Adam(self.local_network.parameters(),
                                    lr=learning_rate)
        """Optimizer used when training the Q-network."""

        # Memory
        self.memory = ReplayBuffer(replay_buffer_size, batch_size, device)

        # Time step
        self.t_step = 0
        """Current time step"""

    def save_weights(self, path):
        """Save local network weights.

        Args:
            path (string): File to save to"""
        self.local_network.save_weights(path)

    def load_weights(self, path):
        """Load local network weights.

        Args:
            path (string): File to load weights from"""
        self.local_network.load_weights(path)

    def act(self, state, eps=0.):
        """Returns action for given state according to the current policy
            
        Args:
            state (np.ndarray): Current state
            eps (float): Probability of selecting random action (epsilon)
            
        Returns:
            int: Epsilon-greedily selected action
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

        # Temporarily set evaluation mode (no dropout &c) & turn off autograd
        self.local_network.eval()
        with torch.no_grad():
            action_values = self.local_network(state)
        self.local_network.train()

        # Select action epsilon-greedily
        if random.random() > eps:
            return int(np.argmax(action_values.cpu().numpy()))
        else:
            return random.randrange(self.action_size)

    def step(self, state, action, reward, next_state, done):
        """Save experience and learn if due.

        Args:
            state (Tensor): Current state
            action (int): Chosen action
            reward (float): Resulting reward
            next_state (Tensor): State after action
            done (bool): True if terminal state
        """
        self.memory.add(state, action, reward, next_state, done)

        # Learn if at update_every steps
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # Check that we have enough stored experiences
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences)

    def learn(self, experiences):
        """Update Q-network using given experiences

        Args:
            experiences (Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]):
                SARS'+done tuple
        """
        states, actions, rewards, next_states, dones = experiences

        # Predicted Q values from target model for next states
        # (NB: torch.max returns a (max, argmax) tuple)
        q_target_next = self.target_network(next_states).max(dim=1,
                                                             keepdim=True)[0]

        # Computed target Q values for current state
        q_target = rewards + self.discount_factor * q_target_next * (1 - dones)

        # Predicted Q values from local model for current state
        q_local = self.local_network(states).gather(dim=1, index=actions)

        loss = F.mse_loss(q_local, q_target)

        # Update local network weights
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network
        soft_update(self.local_network, self.target_network, self.soft_update)
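
A minimal training-loop sketch for the agent above (an addition, not part of the original snippet): it presumes a Gym-style `env` with a discrete action space and uses only the act/step/save_weights methods defined in this class; the function name `train_dqn`, the episode count, the epsilon schedule, and the checkpoint path are illustrative.

def train_dqn(agent, env, n_episodes=1000,
              eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    eps = eps_start
    for episode in range(1, n_episodes + 1):
        state = env.reset()
        done = False
        while not done:
            action = agent.act(state, eps)                        # epsilon-greedy action
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)   # store & learn
            state = next_state
        eps = max(eps_end, eps_decay * eps)                       # decay exploration
        if episode % 100 == 0:
            agent.save_weights('./checkpoints/dqn_checkpoint.pt')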
Esempio n. 22
0
game_wrapper = GameWrapper(ENV_NAME, MAX_NOOP_STEPS)
print("The environment has the following {} actions: {}".format(
    game_wrapper.env.action_space.n,
    game_wrapper.env.unwrapped.get_action_meanings()))

writer = tf.summary.create_file_writer(TENSORBOARD_DIR)

MAIN_DQN = build_q_network(game_wrapper.env.action_space.n,
                           LEARNING_RATE,
                           input_shape=INPUT_SHAPE)
TARGET_DQN = build_q_network(game_wrapper.env.action_space.n,
                             input_shape=INPUT_SHAPE)

replay_buffer = ReplayBuffer(size=MEM_SIZE,
                             input_shape=INPUT_SHAPE,
                             use_per=USE_PER)
agent = Agent(MAIN_DQN,
              TARGET_DQN,
              replay_buffer,
              game_wrapper.env.action_space.n,
              input_shape=INPUT_SHAPE,
              batch_size=BATCH_SIZE,
              use_per=USE_PER)

if LOAD_FROM is None:
    frame_number = 0
    rewards = []
    loss_list = []
else:
    print('Loading from', LOAD_FROM)
Esempio n. 23
0
def main(config, max_samples):
    get_env_parameters(config)
    log_dir = "logs/scalars/" + datetime.datetime.now().strftime(
        "%Y%m%d-%H%M%S")
    file_writer = tf.summary.create_file_writer(log_dir + "/metrics")
    file_writer.set_as_default()
    config['log_dir'] = log_dir
    ray.init()
    parameter_server = ParameterServer.remote(config)
    replay_buffer = ReplayBuffer.remote(config)
    learner = Learner.remote(config, replay_buffer, parameter_server)
    training_actor_ids = []
    eval_actor_ids = []

    learner.start_learning.remote()

    # Create training actors
    for i in range(config["num_workers"]):
        eps = config["max_eps"] * i / config["num_workers"]
        actor = Actor.remote("train-" + str(i), replay_buffer,
                             parameter_server, config, eps)
        actor.sample.remote()
        training_actor_ids.append(actor)

    # Create eval actors
    for i in range(config["eval_num_workers"]):
        eps = 0
        actor = Actor.remote("eval-" + str(i), replay_buffer, parameter_server,
                             config, eps, True)
        eval_actor_ids.append(actor)

    total_samples = 0
    best_eval_mean_reward = np.NINF
    eval_mean_rewards = []
    while total_samples < max_samples:
        tsid = replay_buffer.get_total_env_samples.remote()
        new_total_samples = ray.get(tsid)
        if (new_total_samples - total_samples >=
                config["timesteps_per_iteration"]):
            total_samples = new_total_samples
            print("Total samples:", total_samples)
            parameter_server.set_eval_weights.remote()
            eval_sampling_ids = []
            for eval_actor in eval_actor_ids:
                sid = eval_actor.sample.remote()
                eval_sampling_ids.append(sid)
            eval_rewards = ray.get(eval_sampling_ids)
            print("Evaluation rewards: {}".format(eval_rewards))
            eval_mean_reward = np.mean(eval_rewards)
            eval_mean_rewards.append(eval_mean_reward)
            print("Mean evaluation reward: {}".format(eval_mean_reward))
            tf.summary.scalar('Mean evaluation reward',
                              data=eval_mean_reward,
                              step=total_samples)
            if eval_mean_reward > best_eval_mean_reward:
                print("Model has improved! Saving the model!")
                best_eval_mean_reward = eval_mean_reward
                parameter_server.save_eval_weights.remote()

    print("Finishing the training.")
    for actor in training_actor_ids:
        actor.stop.remote()
    learner.stop.remote()
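
The driver loop above polls get_total_env_samples as fast as it can; a hedged refinement (an addition, not in the original) is to throttle the poll with a short sleep so the driver does not busy-wait between iterations. The helper name `wait_for_samples` and the poll interval are illustrative; only the remote call itself comes from the snippet above.

import time

def wait_for_samples(replay_buffer, last_total, min_delta, poll_interval=1.0):
    """Block until the remote buffer has grown by at least min_delta samples."""
    while True:
        new_total = ray.get(replay_buffer.get_total_env_samples.remote())
        if new_total - last_total >= min_delta:
            return new_total
        time.sleep(poll_interval)        # avoid spinning on the remote call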
Esempio n. 24
0
class DDPG:
    def __init__(self, config):
        self.config = config
        self.state_size = config.state_size
        self.action_size = config.action_size

        self.actor_local = Actor(self.state_size, self.action_size,
                                 2).to(device)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  2).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=config.LR_ACTOR)

        self.critic_local = Critic(self.state_size, self.action_size,
                                   2).to(device)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    2).to(device)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=config.LR_CRITIC,
        )

        self.memory = ReplayBuffer(config.random_seed, config.BUFFER_SIZE)
        self.noise = OUNoise(self.action_size, config.random_seed)

        self.t_step = 0

        self.soft_update(self.critic_local, self.critic_target, 1)
        self.soft_update(self.actor_local, self.actor_target, 1)

    def step(self, states, actions, rewards, next_states, dones):

        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        self.t_step = (self.t_step + 1) % self.config.UPDATE_EVERY

        if len(self.memory) > self.config.BATCH_SIZE and (self.t_step == 0):

            for i in range(self.config.EPOCH):
                experiences = self.memory.sample(self.config.BATCH_SIZE)
                self.learn(experiences)

    def reset(self):
        self.noise.reset()

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def learn(self, experiences):
        states, actions, rewards, next_states, dones = experiences

        Q_targets_next = self.critic_target(next_states,
                                            self.actor_target(next_states))
        Q_targets = rewards + (self.config.GAMMA * Q_targets_next *
                               (1 - dones))
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        actor_loss = -self.critic_local(states,
                                        self.actor_local(states)).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target,
                         self.config.TAU)
        self.soft_update(self.actor_local, self.actor_target, self.config.TAU)

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
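
A minimal interaction sketch for the DDPG class above (an addition, not part of the original snippet): the environment is assumed to return one state, reward, and done flag per agent, matching the batched step(states, actions, ...) signature used above; the function name `run_episode` is illustrative.

def run_episode(agent, env, max_steps=1000):
    states = env.reset()
    agent.reset()                                    # reset the OU noise process
    episode_reward = 0.0
    for _ in range(max_steps):
        actions = agent.act(states, add_noise=True)  # actions clipped to [-1, 1]
        next_states, rewards, dones, _ = env.step(actions)
        agent.step(states, actions, rewards, next_states, dones)
        states = next_states
        episode_reward += np.mean(rewards)
        if np.any(dones):
            break
    return episode_reward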
Esempio n. 25
0
def train(sess, env, actor, critic, noise, reward, discrete):
    # set up summary writer
    summary_writer = tf.summary.FileWriter("ddpg_summary")

    sess.run(tf.global_variables_initializer())

    # initialize target and critic network
    actor.update_target_network()
    critic.update_target_network()

    # initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    # initialize noise
    ou_level = 0.

    for i in range(MAX_EPISODES):
        s = env.reset()
        ep_reward = 0
        ep_ave_max_q = 0

        episode_buffer = np.empty((0, 5), float)

        for j in range(MAX_EP_STEPS):
            if RENDER_ENV:
                env.render()
            a = actor.predict(np.reshape(s, (1, actor.s_dim)))

            # Add exploration noise
            if i < NOISE_MAX_EP:
                ou_level = noise.ornstein_uhlenbeck_level(ou_level)
                a = a + ou_level

            # Set action for discrete and continuous action spaces
            if discrete:
                action = np.argmax(a)
            else:
                action = a[0]

            s2, r, terminal, info = env.step(action)

            # Choose reward type
            ep_reward += r

            episode_buffer = np.append(episode_buffer,
                                       [[s, a, r, terminal, s2]],
                                       axis=0)

            # Adding experience to memory
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))
                ep_ave_max_q += np.max(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            # Set previous state for next step
            s = s2
            if terminal:
                # Apply the reward discounting for the episode
                episode_buffer = reward.discount(episode_buffer)

                # Add episode to replay
                for step in episode_buffer:
                    replay_buffer.add(np.reshape(step[0], (actor.s_dim, )),
                                      np.reshape(step[1], (actor.a_dim, )),
                                      step[2], step[3],
                                      np.reshape(step[4], (actor.s_dim, )))

                # summary = tf.summary()
                # summary.value.add(tag="Perf/Reward", simple_value=float(ep_reward))
                # summary.value.add(tag="Perf/Qmax", simple_value=float(ep_ave_max_q / float(j)))
                # summary_writer.add_summary(summary, i)

                # summary_writer.flush()

                if i != 0:
                    print("|Reward: %.2i | Episode: %d | Qmax: %.4f" %
                          (int(ep_reward), i, (ep_ave_max_q / float(i))))
                break
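
The per-sample loop that builds y_i above can also be written in vectorized NumPy; the sketch below is an addition, not part of the original snippet. It assumes t_batch is a boolean or 0/1 array and target_q has shape (MINIBATCH_SIZE, 1), as produced by critic.predict_target; the helper name `td_targets` is illustrative.

def td_targets(r_batch, t_batch, target_q, gamma):
    """y_i = r_i + gamma * Q'(s2_i, mu'(s2_i)), zeroed on terminal steps."""
    r = np.reshape(r_batch, (-1, 1)).astype(np.float32)
    not_terminal = 1.0 - np.reshape(t_batch, (-1, 1)).astype(np.float32)
    return r + gamma * not_terminal * target_q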
Esempio n. 26
0
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

        # Score tracker and learning parameters
        self.best_score = -np.inf

    def reset_episode(self):
        self.total_reward = 0.0
        self.count = 0

        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Save reward
        self.total_reward += reward
        self.count += 1

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

        if done:
            # Keeping track of the score
            self.score = self.total_reward / float(
                self.count) if self.count else 0.0
            if self.score > self.best_score:
                self.best_score = self.score

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
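
An interaction sketch for the agent above (an addition, not part of the original snippet): it presumes task.step(action) returns (next_state, reward, done), which is the interface reset_episode() and step() above appear to expect; the function name `run_task_episode` is illustrative.

def run_task_episode(agent, max_steps=1000):
    state = agent.reset_episode()
    for _ in range(max_steps):
        action = agent.act(state)                        # noisy action from the local actor
        next_state, reward, done = agent.task.step(action)
        agent.step(action, reward, next_state, done)     # store, learn, roll over last_state
        state = next_state
        if done:
            break
    return agent.total_reward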
Esempio n. 27
0
    def __init__(self,
                 state_size,
                 action_size,
                 device,
                 replay_buffer_size=int(1e5),
                 batch_size=64,
                 discount_factor=0.99,
                 soft_update=1e-3,
                 learning_rate=5e-4,
                 update_every=4,
                 **kwargs):
        """Initializes the DQN agent.

        Args:
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            device (torch.device): Device to use for calculations
            replay_buffer_size (int): Size of replay buffer
            batch_size (int): Size of experience batches during training
            discount_factor (float): Discount factor (gamma)
            soft_update (float): Soft update coefficient (tau)
            learning_rate (float): Learning rate (alpha)
            update_every (int): Steps between updating the network
            **kwargs: Arguments describing the QNetwork
        """
        self.state_size = state_size
        """Dimension of each state"""

        self.action_size = action_size
        """Dimension of each action"""

        self.device = device
        """Device to use for calculations"""

        # Parameters
        self.batch_size = batch_size
        """Size of experience batches during training"""

        self.discount_factor = discount_factor
        """Discount factor (gamma)"""

        self.soft_update = soft_update
        """Soft update coefficient (tau)"""

        self.update_every = update_every
        """Steps between updating the network"""

        # Q Networks
        self.target_network = QNetwork(state_size, action_size, **kwargs) \
            .to(device)
        """Target Q-Network"""

        self.local_network = QNetwork(state_size, action_size, **kwargs) \
            .to(device)
        """Local Q-Network"""

        self.optimizer = optim.Adam(self.local_network.parameters(),
                                    lr=learning_rate)
        """Optimizer used when training the Q-network."""

        # Memory
        self.memory = ReplayBuffer(replay_buffer_size, batch_size, device)

        # Time step
        self.t_step = 0
        """Current time step"""
Esempio n. 28
0
    processes = [
        multiprocessing.Process(target=environment_process,
                                args=(port, ),
                                daemon=True) for i in range(NUM_ENVIRONMENTS)
    ]
    for p in processes:
        p.start()

    step = 0
    start = time.time()
    recent_steps = []
    recent_total_reward = []
    recent_collisions = []
    recent_values = []
    recent_stats = [[] for _ in range(5)]
    replay_buffer = ReplayBuffer(REPLAY_MAX)

    while True:

        # Read request and process.
        request = socket.recv_pyobj()
        instruction = request[0]
        instruction_data = request[1:]

        if instruction == 'CALL_GENERATOR':

            if len(replay_buffer) < REPLAY_MIN:
                if TASK in ['DriveStraight', 'DriveHard']:
                    params = rand_macro_action_set_drive_straight(7)
                else:
                    params = rand_macro_action_set(8, 3)
Esempio n. 29
0
def train(env, config):
    """
    Execute training of Soft Actor Critic
    """

    timesteps_elapsed = 0
    episodes_elapsed = 0

    STATE_SIZE = env.observation_space.shape[0]
    ACTION_SIZE = env.action_space.n

    #policy_net = Net(sizes=(STATE_SIZE, *config["hidden_size"], ACTION_SIZE)).to(device=config["device"])
    policy_net = ConvNet(nChannels=1, nOut=ACTION_SIZE).to(config["device"])
    policy_net.apply(init_weights)

    Q_net1 = ConvNet(nChannels=1, nOut=ACTION_SIZE).to(config["device"])
    Q_net1.apply(init_weights)
    Q_net2 = ConvNet(nChannels=1, nOut=ACTION_SIZE).to(config["device"])
    Q_net2.apply(init_weights)
    Q_target_net1 = copy.deepcopy(Q_net1)
    Q_target_net2 = copy.deepcopy(Q_net2)
    Q_target_net1.freeze()
    Q_target_net2.freeze()

    log_alpha = nn.Parameter(
        torch.tensor([math.log(config["alpha"])], device=config["device"]))
    entropy_target = -math.log(1 / ACTION_SIZE) * config[
        "target_entropy_ratio"]  # that is, maximum entropy times a ratio

    optimizer_policy = torch.optim.Adam(policy_net.parameters(),
                                        lr=config["learning_rate_policy"])
    optimizer_q = torch.optim.Adam(list(Q_net1.parameters()) +
                                   list(Q_net2.parameters()),
                                   lr=config["learning_rate_value"])
    optimizer_alpha = torch.optim.Adam([log_alpha],
                                       lr=config["learning_rate_alpha"],
                                       eps=1e-4)

    replay_buffer = ReplayBuffer(config["buffer_capacity"])
    n_step_buffer = NstepBuffer(config["n_steps"])

    eval_returns_all = []
    eval_times_all = []

    train_policy = False

    start_time = time.time()
    with tqdm(total=config["max_timesteps"]) as pbar:
        while timesteps_elapsed < config["max_timesteps"]:
            elapsed_seconds = time.time() - start_time
            if elapsed_seconds > config["max_time"]:
                pbar.write("Training ended after {}s.".format(elapsed_seconds))
                break

            episode_timesteps, _ = play_episode(
                env,
                policy_net=policy_net,
                Q_net1=Q_net1,
                Q_net2=Q_net2,
                Q_target_net1=Q_target_net1,
                Q_target_net2=Q_target_net2,
                optimizer_policy=optimizer_policy,
                optimizer_q=optimizer_q,
                optimizer_alpha=optimizer_alpha,
                log_alpha=log_alpha,
                entropy_target=entropy_target,
                gamma=config["gamma"],
                replay_buffer=replay_buffer,
                n_step_buffer=n_step_buffer,
                train=True,
                train_policy=train_policy,
                render=config["render"],
                max_steps=config["episode_length"],
                steps_init_training=config["steps_init_training"],
                steps_per_learning_update=config["steps_per_learning_update"],
                batch_size=config["batch_size"],
                device=config["device"])
            timesteps_elapsed += episode_timesteps
            episodes_elapsed += 1
            pbar.update(episode_timesteps)

            # Train the policy only on iterations that cross a train_policy_freq boundary
            train_policy = (timesteps_elapsed % config["train_policy_freq"]
                            < episode_timesteps)

            if timesteps_elapsed > config["steps_init_training"]:
                if (timesteps_elapsed - config["steps_init_training"]
                    ) % config["target_update_freq"] < episode_timesteps:
                    Q_target_net1.soft_update(Q_net1, 0.8)
                    Q_target_net2.soft_update(Q_net2, 0.8)

            if timesteps_elapsed % config["eval_freq"] < episode_timesteps:
                eval_returns = 0
                for _ in range(config["eval_episodes"]):
                    _, episode_return = play_episode(
                        env,
                        policy_net,
                        Q_net1,
                        Q_net2,
                        Q_target_net1,
                        Q_target_net2,
                        optimizer_policy,
                        optimizer_q,
                        optimizer_alpha=optimizer_alpha,
                        log_alpha=log_alpha,
                        entropy_target=entropy_target,
                        gamma=config["gamma"],
                        replay_buffer=replay_buffer,
                        n_step_buffer=n_step_buffer,
                        train=False,
                        train_policy=train_policy,
                        render=config["render"],
                        max_steps=config["episode_length"],
                        batch_size=config["batch_size"],
                        device=config["device"])
                    eval_returns += episode_return
                eval_returns = eval_returns / config["eval_episodes"]
                eval_returns_all.append(eval_returns)
                pbar.write(
                    "Evaluation at timestep {} and episode {} returned a mean returns of {}"
                    .format(timesteps_elapsed, episodes_elapsed, eval_returns))

                if eval_returns >= config["target_return"]:
                    pbar.write(
                        "Reached return {} >= target return of {}".format(
                            eval_returns, config["target_return"]))
                    break

    print("Saving policy to {}".format(config["save_filename"]))
    torch.save(policy_net, config["save_filename"])

    return np.array(eval_returns_all)
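
play_episode() is not shown here, so the way log_alpha and entropy_target enter the losses is not visible; the sketch below is one common discrete-SAC temperature update written against the tensors defined above, and is an assumption rather than the author's implementation. The helper name `update_alpha` is illustrative.

def update_alpha(optimizer_alpha, log_alpha, policy_net, states, entropy_target):
    logits = policy_net(states)
    log_probs = torch.log_softmax(logits, dim=-1)
    probs = log_probs.exp()
    entropy = -(probs * log_probs).sum(dim=-1)                    # per-state policy entropy
    alpha_loss = (log_alpha * (entropy - entropy_target).detach()).mean()
    optimizer_alpha.zero_grad()
    alpha_loss.backward()
    optimizer_alpha.step()
    return log_alpha.exp().item()                                 # current temperature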
Esempio n. 30
0
class Agent(object):
    """DDPG Agent that interacts and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 device,
                 actor_args={},
                 critic_args={}):
        """Initializes the DQN agent.

        Args:
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            device (torch.device): Device to use for calculations
            actor_args (dict): Arguments describing the actor network
            critic_args (dict): Arguments describing the critic network
        """
        self.state_size = state_size
        """Dimension of each state"""

        self.action_size = action_size
        """Dimension of each action"""

        self.device = device
        """Device to use for calculations"""

        self.t_step = 0
        """Timestep between training updates"""

        # Parameters

        # Actor network
        self.actor_local = Actor(state_size, action_size,
                                 **actor_args).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  **actor_args).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic network

        self.critic_local = Critic(state_size, action_size,
                                   **critic_args).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    **critic_args).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process for exploration
        self.noise = OUNoise(action_size, sigma=NOISE_SD)

        # Replay memory
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, self.device)

    def reset(self):
        """Reset state of agent."""
        self.noise.reset()

    def save_weights(self, path):
        """Save local network weights.

        Args:
            path (string): File to save to"""
        torch.save(
            {
                'actor_local': self.actor_local.state_dict(),
                'actor_target': self.actor_target.state_dict(),
                'critic_local': self.critic_local.state_dict(),
                'critic_target': self.critic_target.state_dict()
            }, path)

    def load_weights(self, path):
        """Load local network weights.

        Args:
            path (string): File to load weights from"""
        checkpoint = torch.load(path)
        self.actor_local.load_state_dict(checkpoint['actor_local'])
        self.actor_target.load_state_dict(checkpoint['actor_target'])
        self.critic_local.load_state_dict(checkpoint['critic_local'])
        self.critic_target.load_state_dict(checkpoint['critic_target'])

    def act(self, state, add_noise=True):
        """Returns action for given state according to the current policy
            
        Args:
            state (np.ndarray): Current state

        Returns:
            action (np.ndarray): Action tuple
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

        # Temporarily set evaluation mode (no dropout &c) & turn off autograd
        self.actor_local.eval()

        with torch.no_grad():
            action = self.actor_local(state).cpu().detach().numpy()

        # Resume training mode
        self.actor_local.train()

        # Add noise if exploring
        if add_noise:
            action += self.noise.sample()
            # The noise might take us out of range
            action = np.clip(action, -1, 1)

        return action

    def step(self, state, action, reward, next_state, done):
        """Save experience and learn if due.
        Args:
            state (Tensor): Current state
            action (int): Chosen action
            reward (float): Resulting reward
            next_state (Tensor): State after action
            done (bool): True if terminal state
        """
        self.memory.add(state, action, reward, next_state, done)

        # Learn as soon as we have enough stored experiences
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0 and len(self.memory) > BATCH_SIZE:
            for _ in range(NUM_UPDATES):
                experiences = self.memory.sample()
                self.learn(experiences)

    def learn(self, experiences):
        """Learn from batch of experiences."""
        states, actions, rewards, next_states, dones = experiences

        # region Update Critic
        actions_next = self.actor_target(next_states)
        q_targets_next = self.critic_target(next_states, actions_next)

        q_targets = rewards + (GAMMA * q_targets_next * (1 - dones))

        q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(q_expected, q_targets)

        # Minimize loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1.0)
        self.critic_optimizer.step()
        # endregion

        # region Update Actor
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        # endregion

        # Update target networks
        soft_update(self.critic_local, self.critic_target, TAU)
        soft_update(self.actor_local, self.actor_target, TAU)
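
learn() above (like the DQN agent in Esempio n. 21) calls a module-level soft_update() that is not shown in these snippets; below is a minimal sketch, assuming the same tau-blend used by the DDPG.soft_update method in Esempio n. 24.

def soft_update(local_model, target_model, tau):
    """theta_target <- tau * theta_local + (1 - tau) * theta_target."""
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(tau * local_param.data +
                                (1.0 - tau) * target_param.data)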
Esempio n. 31
0
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 random_seed,
                 device,
                 lr_actor,
                 lr_critic,
                 weight_decay_critic,
                 batch_size,
                 buffer_size,
                 gamma,
                 tau,
                 update_every,
                 n_updates,
                 eps_start,
                 eps_end,
                 eps_decay):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            random_seed (int): random seed
            (the remaining arguments are hyperparameters stored as
             attributes of the same name)
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        random.seed(random_seed)
        self.seed = random_seed
        self.t_step = 0
        self.device = device
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.weight_decay_critic = weight_decay_critic
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.gamma = gamma
        self.tau = tau
        self.update_every = update_every
        self.n_updates = n_updates
        self.eps = eps_start
        self.eps_end = eps_end
        self.eps_decay = eps_decay

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(self.device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(self.device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic, weight_decay=self.weight_decay_critic)

        # Noise process
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, random_seed, self.device)
Esempio n. 32
0
class Learner(object):
    """
    Generic object which runs the main training loop of anything that trains using
    a replay buffer. Handles updating, logging, saving/loading, batching, etc.
    """
    def __init__(self, interactor_queue, lock, config, env_config, learner_config, **bonus_kwargs):
        self.learner_name = self.learner_name()
        self.interactor_queue = interactor_queue
        self.learner_lock = lock
        self.config = config
        self.env_config = env_config
        self.learner_config = learner_config
        self.bonus_kwargs = bonus_kwargs
        self.kill_threads = False
        self.permit_desync = False
        self.need_frames_notification = threading.Condition()
        self._reset_inspections()
        self.total_frames = 0

        self.save_path = util.create_directory("%s/%s/%s/%s" % (self.config["output_root"], self.config["env"]["name"], self.config["name"], self.config["save_model_path"]))
        self.log_path = util.create_directory("%s/%s/%s/%s" % (self.config["output_root"], self.config["env"]["name"], self.config["name"],  self.config["log_path"])) + "/%s.log" % self.learner_name

        # replay buffer to store data
        self.replay_buffer_lock = threading.RLock()
        self.replay_buffer = ReplayBuffer(self.learner_config["replay_size"],
                                          np.prod(self.env_config["obs_dims"]),
                                          self.env_config["action_dim"])

        # data loaders pull data from the replay buffer and put it into the tfqueue for model usage
        self.data_loaders = self.make_loader_placeholders()
        queue_capacity = int(np.ceil(1. / self.learner_config["frames_per_update"])) if self.learner_config["frames_per_update"] else 100
        self.tf_queue = tf.FIFOQueue(capacity=queue_capacity, dtypes=[dl.dtype for dl in self.data_loaders])
        self.enqueue_op = self.tf_queue.enqueue(self.data_loaders)
        self.current_batch = self.tf_queue.dequeue()

        # build the TF graph for the actual model to train
        self.core, self.train_losses, self.train_ops, self.inspect_losses = self.make_core_model()
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

    ## Mandatory functions to override
    def learner_name(self): raise Exception('unimplemented: learner_name')
    def make_loader_placeholders(self): raise Exception('unimplemented: make_loader_placeholders')
    def make_core_model(self): raise Exception('unimplemented: make_core_model')

    ## Optional functions to override
    def initialize(self): warnings.warn('unimplemented: initialize')
    def resume_from_checkpoint(self, epoch): warnings.warn('unimplemented: resume_from_checkpoint')
    def checkpoint(self): warnings.warn('unimplemented: checkpoint')
    def backup(self): warnings.warn('unimplemented: backup')

    ## Internal functions
    def _start(self):
        # fetch data from the interactors to pre-fill the replay buffer
        self.prefetch_thread = threading.Thread(target=self._poll_interactors, args=(True, self.learner_config["frames_before_learning"],))
        self.prefetch_thread.start()
        self.prefetch_thread.join()

        # start the interactor and data loader
        self.data_load_thread = threading.Thread(target=self._run_enqueue_data)
        self.data_load_thread.start()

        # initialize the learner, pretraining if needed
        if self.config["resume"]: self._resume_from_checkpoint()
        else:                     self._initialize()

        # re-sync everything, and start up interactions with the environment
        self.interactor_poll_thread = threading.Thread(target=self._poll_interactors)
        self.interactor_poll_thread.start()

        # start the clock
        self._last_checkpoint_time = time.time()

    def _learn(self, permit_desync=False, log=True, checkpoint=True, backup=True):
        # this is to keep the frames/update synced properly
        if self.learner_config["frames_per_update"] is not False and not permit_desync:
            if not self._have_enough_frames():
                with self.need_frames_notification:
                    self.need_frames_notification.notify()
                return

        # log
        if log and (self.update_i + 1) % self.learner_config["log_every_n"] == 0:
            self._log()

        # checkpoint
        if checkpoint and (self.update_i + 1) % self.learner_config["epoch_every_n"] == 0:
            self._checkpoint()

        # backup
        if backup and (self.update_i + 1) % self.learner_config["backup_every_n"] == 0:
            self._backup()

        # train
        self._training_step()

    def _have_enough_frames(self):
        gathered_frames = self.total_frames - self.learner_config["frames_before_learning"]
        return gathered_frames > self.learner_config["frames_per_update"] * self.update_i

    def _initialize(self):
        self.epoch = 0
        self.update_i = 0
        self.hours = 0
        self._last_checkpoint_time = time.time()

        self.initialize()

        if self.learner_config["pretrain_n"]: self._pretrain()
        self._checkpoint()

    def _pretrain(self):
        for _ in range(self.learner_config["pretrain_n"]):
            self._learn(permit_desync=True, checkpoint=False, backup=False)
        self.epoch = 0
        self.update_i = 0

    def _resume_from_checkpoint(self):
        epoch = util.get_largest_epoch_in_dir(self.save_path, self.core.saveid)
        if not self.config['keep_all_replay_buffers']: util.wipe_all_but_largest_epoch_in_dir(self.save_path, self.core.saveid)
        if epoch is False:
            raise Exception("Tried to reload but no model found")
        with self.learner_lock:
            self.core.load(self.sess, self.save_path, epoch)
            self.epoch, self.update_i, self.total_frames, self.hours = self.sess.run([self.core.epoch_n, self.core.update_n, self.core.frame_n, self.core.hours])
        with self.replay_buffer_lock:
            self.replay_buffer.load(self.save_path, '%09d_%s' % (epoch, self.learner_name))
        self.resume_from_checkpoint(epoch)

    def _log(self):
        if self.denom > 0:
            logstring = "(%3.2f sec) h%-8.2f e%-8d s%-8d f%-8d\t" % (time.time() - self._log_time, self.hours, self.epoch, self.update_i + 1, self.total_frames) + ', '.join(["%8f" % x for x in (self.running_total / self.denom).tolist()])
            print("%s\t%s" % (self.learner_name, logstring))
            with open(self.log_path, "a") as f: f.write(logstring + "\n")
        self._reset_inspections()

    def _reset_inspections(self):
        self.running_total = 0.
        self.denom = 0.
        self._log_time = time.time()

    def _checkpoint(self):
        self.checkpoint()
        self.epoch += 1
        self.hours += (time.time() - self._last_checkpoint_time) / 3600.
        self._last_checkpoint_time = time.time()
        self.core.update_epoch(self.sess, self.epoch, self.update_i, self.total_frames, self.hours)
        with self.learner_lock: self.core.save(self.sess, self.save_path)

    def _backup(self):
        self.backup()
        if not self.learner_config['keep_all_replay_buffers']: util.wipe_all_but_largest_epoch_in_dir(self.save_path, self.core.saveid)
        with self.learner_lock:
            self.core.save(self.sess, self.save_path, self.epoch)
        with self.replay_buffer_lock:
            self.replay_buffer.save(self.save_path, '%09d_%s' % (self.epoch, self.learner_name))

    def _training_step(self):
        train_ops = tuple([op for op, loss in zip(self.train_ops,
                                                  self.train_losses)
                           if loss is not None])
        outs = self.sess.run(train_ops + self.inspect_losses)
        self.running_total += np.array(outs[len(train_ops):])
        self.denom += 1.
        self.update_i += 1

    def _poll_interactors(self, continuous_poll=False, frames_before_terminate=None):
        # poll the interactors for new frames.
        # the synced_condition semaphore prevents this from consuming too much CPU
        while not self.kill_threads:
            if self.learner_config["frames_per_update"] is not False and not continuous_poll:
                with self.need_frames_notification: self.need_frames_notification.wait()
            while not self.interactor_queue.empty():
                new_frames = self.interactor_queue.get()
                self._add_frames(new_frames)
                if frames_before_terminate and self.total_frames >= frames_before_terminate: return

    def _add_frames(self, frames):
        with self.replay_buffer_lock:
            for frame in frames:
                self.replay_buffer.add_replay(*frame)
            self.total_frames = self.replay_buffer.count
        return self.total_frames

    def _run_enqueue_data(self):
        while not self.kill_threads:
            data = self.replay_buffer.random_batch(self.learner_config["batch_size"])
            self.sess.run(self.enqueue_op, feed_dict=dict(list(zip(self.data_loaders, data))))

    def _kill_threads(self):
        self.kill_threads = True