Example #1
class Trainer(object):
    def __init__(self, agent, env):
        self.agent = agent
        self.env = env
        self.seed = random.randint(0, 20180818)
        self.optimizer = optim.Adam(agent.parameters, lr=LEARNING_RATE)
        self.buffer = ReplayBuffer(capacity=CAPACITY)
        self.total_step = 0

    def run(self, device='cpu', buffer=False, explore=False):
        """Run an episode and buffer"""
        self.env.reset()
        self.env.env.seed(self.seed)
        state = self.env.get_screen()
        states = np.asarray([state for _ in range(4)])  # shape (4, 84, 84)
        step = 0
        accumulated_reward = 0
        while True:
            action = self.agent.make_action(torch.Tensor([states]).to(device),
                                            explore=explore)
            state_next, reward, done = self.env.step(action)
            states_next = np.concatenate([states[1:, :, :], [state_next]],
                                         axis=0)
            step += 1
            accumulated_reward += reward
            if buffer:
                self.buffer.append(states, action, reward, states_next, done)
            states = states_next
            if done:
                break
        return accumulated_reward, step

    def _fill_buffer(self, num, device='cpu'):
        start = time.time()
        while self.buffer.size < num:
            self.run(device, buffer=True, explore=True)
            print('Fill buffer: {}/{}'.format(self.buffer.size,
                                              self.buffer.capacity))
        print('Filling buffer takes {:.3f} seconds'.format(time.time() -
                                                           start))

    def train(self, device='cpu'):
        self.env.change_record_every_episode(100000000)
        self._fill_buffer(OBSERV, device)
        if self.env.record_every_episode:
            self.env.change_record_every_episode(self.env.record_every_episode)

        episode = 0
        while 'training' != 'converge':
            self.env.reset()
            state = self.env.get_screen()
            states = np.asarray([state for _ in range(4)])  # shape (4, 84, 84)
            step_prev = self.total_step
            accumulated_reward = 0
            done = False
            n_flap = 0
            n_none = 0
            while not done:
                #### --------------------
                #### Add a new transition
                action = self.agent.make_action(
                    torch.Tensor([states]).to(device), explore=True)
                state_next, reward, done = self.env.step(action)
                states_next = np.concatenate([states[1:, :, :], [state_next]],
                                             axis=0)
                self.total_step += 1
                accumulated_reward += reward
                self.buffer.append(states, action, reward, states_next, done)
                states = states_next
                #### --------------------

                #### --------------------
                #### Training step
                start = time.time()
                # prepare training data
                minibatch = self.buffer.sample(n_sample=BATCH)
                _states = [b[0] for b in minibatch]
                _actions = [b[1] for b in minibatch]
                _rewards = [b[2] for b in minibatch]
                _states_next = [b[3] for b in minibatch]
                _dones = [b[4] for b in minibatch]

                ys = []
                for i in range(len(minibatch)):
                    terminal = _dones[i]
                    r = _rewards[i]
                    if terminal:
                        y = r
                    else:
                        # Double DQN
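                        # y = r + gamma * Q_target(s', argmax_a Q_online(s', a)):
                        # the online net selects the action, the target net evaluates it.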
                        s_t_next = torch.Tensor([_states_next[i]]).to(device)
                        online_act = self.agent.make_action(s_t_next)
                        y = r + DISCOUNT * self.agent.Q(
                            s_t_next, online_act, target=True)
                    ys.append(y)
                ys = torch.Tensor(ys).to(device)

                # Apply gradient
                self.optimizer.zero_grad()
                inputs = torch.Tensor(_states).to(device)
                output = self.agent.net(inputs)  # shape (BATCH, 2)
                actions_one_hot = np.zeros([BATCH, 2])
                actions_one_hot[np.arange(BATCH), _actions] = 1.0
                actions_one_hot = torch.Tensor(actions_one_hot).to(device)
                ys_hat = (output * actions_one_hot).sum(dim=1)
                loss = F.smooth_l1_loss(ys_hat, ys)
                loss.backward()
                self.optimizer.step()
                #### --------------------

                # logging
                if action == 0:
                    n_flap += 1
                else:
                    n_none += 1

                if done and self.total_step % LOGGING_CYCLE == 0:
                    log = '[{}, {}] alive: {}, reward: {}, F/N: {}/{}, loss: {:.4f}, epsilon: {:.4f}, time: {:.3f}'.format(
                        episode, self.total_step, self.total_step - step_prev,
                        accumulated_reward, n_flap, n_none, loss.item(),
                        self.agent.epsilon,
                        time.time() - start)
                    print(log)

                self.agent.update_epsilon()
                if self.total_step % TARGET_UPDATE_CYCLE == 0:
                    #print('[Update target network]')
                    self.agent.update_target()

                if self.total_step % SAVE_MODEL_CYCLE == 0:
                    print('[Save model]')
                    self.save(id=self.total_step)
            episode += 1

    def save(self, id):
        filename = 'tmp/models/model_{}.pth.tar'.format(id)
        dirpath = os.path.dirname(filename)
        if not os.path.exists(dirpath):
            os.makedirs(dirpath)
        checkpoint = {
            'net': self.agent.net.state_dict(),
            'target': self.agent.target.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'total_step': self.total_step
        }
        torch.save(checkpoint, filename)

    def load(self, filename, device='cpu'):
        ckpt = torch.load(filename, map_location=lambda storage, loc: storage)
        ## Deal with the missing of bn.num_batches_tracked
        net_new = OrderedDict()
        tar_new = OrderedDict()

        for k, v in ckpt['net'].items():
            for _k, _v in self.agent.net.state_dict().items():
                if k == _k:
                    net_new[k] = v

        for k, v in ckpt['target'].items():
            for _k, _v in self.agent.target.state_dict().items():
                if k == _k:
                    tar_new[k] = v

        self.agent.net.load_state_dict(net_new)
        self.agent.target.load_state_dict(tar_new)
        ## -----------------------------------------------

        self.optimizer.load_state_dict(ckpt['optimizer'])
        self.total_step = ckpt['total_step']
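
The ReplayBuffer class that this Trainer relies on is not shown on this page. A minimal sketch that matches the calls above (ReplayBuffer(capacity=...), .size, .capacity, .append(states, action, reward, states_next, done), and .sample(n_sample=...) returning a list of 5-tuples) could look like the following; the deque-based storage and the names are assumptions, not the original implementation.

import random
from collections import deque


class ReplayBuffer(object):
    """Fixed-capacity FIFO store of (s, a, r, s_next, done) transitions."""

    def __init__(self, capacity):
        self.capacity = capacity
        self._storage = deque(maxlen=capacity)  # oldest transitions drop out automatically

    @property
    def size(self):
        return len(self._storage)

    def append(self, states, action, reward, states_next, done):
        self._storage.append((states, action, reward, states_next, done))

    def sample(self, n_sample):
        # Uniform sampling without replacement, matching buffer.sample(n_sample=BATCH).
        return random.sample(list(self._storage), n_sample)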
Example #2
class DQNAgent:
    def __init__(
            self,
            n_actions,
            learning_rate=0.001,
            gamma=0.9,
            #gamma=0.95,
            batch_size=64,
            replay_buffer_size=200000,
            replay_start_size=1000):
        """
        :param n_actions: the number of possible actions
        :param learning_rate: the learning rate for the optimizer
        :param gamma: discount factor
        :param batch_size: size of a minibatch
        :param replay_buffer_size: the size of the replay memory
        :param replay_start_size: the initial size of the replay memory before learning starts
        Note: the target network is synchronized externally via update_target_network().
        """
        self.n_actions = n_actions
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.batch_size = batch_size

        # Create the replay buffer
        #self.replay_buffer = deque(maxlen=replay_buffer_size)
        self.replay_buffer = ReplayBuffer(max_size=replay_buffer_size)
        self.replay_start_size = replay_start_size

        # Build the Q-network
        self.q_network = self.build_model()

        # Create the target network as a copy of the Q-network
        self.target_network = keras.models.clone_model(self.q_network)

        # Create the optimizer
        self.optimizer = keras.optimizers.Adam(self.learning_rate)

        self.training_step = 0

    def build_model(self):
        model = keras.models.Sequential([
            layers.Dense(256, activation='relu'),
            layers.Dense(256, activation='relu'),
            layers.Dense(256, activation='relu'),
            layers.Dense(self.n_actions)
        ])
        return model

    def select_action(self, state, epsilon):
        """
        An epsilon-greedy action selection
        :param state: the current state of the environment
        :param epsilon: the exploration rate
        :return: an action
        """
        if np.random.rand() < epsilon:
            return np.random.choice(self.n_actions)
        else:
            q_values = self.q_network.predict(np.expand_dims(state, axis=0))[0]
            return np.argmax(q_values)

    def remember(self, state, action, reward, next_state, done, info):
        """Store a new transition in the replay buffer"""
        self.replay_buffer.append((state, action, reward, next_state, done))

    def sample_transitions(self):
        #indices = np.random.randint(len(self.replay_buffer), size=self.batch_size)
        #mini_batch = [self.replay_buffer[index] for index in indices]
        mini_batch = self.replay_buffer.sample(self.batch_size)

        states, actions, rewards, next_states, dones = [
            np.array([transition[field_index] for transition in mini_batch])
            for field_index in range(5)
        ]
        return states, actions, rewards, next_states, dones

    def train(self):
        """Perform a single training step on the network"""

        # Check that we have enough transitions in the replay buffer
        if len(self.replay_buffer) < max(self.batch_size,
                                         self.replay_start_size):
            return

        # Sample transitions from the replay buffer
        states, actions, rewards, next_states, dones = self.sample_transitions()

        # Compute the target Q values for the sampled transitions
        next_q_values = self.target_network.predict(next_states)
        max_next_q_values = np.max(next_q_values, axis=1)
        target_q_values = rewards + (1 - dones) * self.gamma * max_next_q_values

        with tf.GradientTape() as tape:
            # Forward pass: compute the Q-values for the states in the batch
            all_q_values = self.q_network(states)

            # Mask out the Q-values for the non-chosen actions
            mask = tf.one_hot(actions, self.n_actions)
            q_values = tf.reduce_sum(all_q_values * mask, axis=1)

            # Compute the loss between the targets and the Q-values
            loss_fn = keras.losses.Huber()
            loss = loss_fn(target_q_values, q_values)

        # Perform a gradient descent step to minimize the loss with respect
        # to the model's trainable variables
        gradients = tape.gradient(loss, self.q_network.trainable_variables)
        self.optimizer.apply_gradients(
            zip(gradients, self.q_network.trainable_variables))

    def update_target_network(self):
        self.target_network.set_weights(self.q_network.get_weights())

    def save_model(self, folder, env_id):
        """Save the network params to a file"""
        agent_file = os.path.join(folder, f'{env_id}.h5')
        keras.models.save_model(self.q_network, agent_file)

    def load_model(self, folder, env_id):
        """Load the network params from a file"""
        agent_file = os.path.join(folder, f'{env_id}.h5')
        self.q_network = keras.models.load_model(agent_file)
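
A rough sketch of how a DQNAgent like this is typically driven, assuming the classic Gym API (env.reset() returns a state, env.step() returns (next_state, reward, done, info)); the environment name, the target-update interval and the epsilon schedule below are illustrative choices, not taken from the original code.

import gym

env = gym.make("CartPole-v1")            # assumed environment
agent = DQNAgent(n_actions=env.action_space.n)

TARGET_UPDATE_INTERVAL = 500             # assumed constant
epsilon, total_steps = 1.0, 0

for episode in range(200):
    state = env.reset()
    done = False
    while not done:
        action = agent.select_action(state, epsilon)
        next_state, reward, done, info = env.step(action)
        agent.remember(state, action, reward, next_state, done, info)
        agent.train()                    # silently skips until the buffer is warm
        state = next_state
        total_steps += 1
        epsilon = max(0.05, epsilon * 0.999)
        if total_steps % TARGET_UPDATE_INTERVAL == 0:
            agent.update_target_network()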
Example #3
class Agent:
    def __init__(self):
        self.name = "expected_sarsa_agent"

    def agent_init(self, agent_config):
        """Setup for the agent called when the experiment first starts.

        Set parameters needed to setup the agent.

        Assume agent_config dict contains:
        {
            network_pickle: string (optional),
            network_config: dictionary,
            optimizer_config: dictionary,
            replay_buffer_size: integer,
            minibatch_sz: integer,
            num_replay_updates_per_step: integer,
            gamma: float,
            tau: float,
            seed: integer (optional)
        }
        """
        self.replay_buffer = ReplayBuffer(agent_config['replay_buffer_size'],
                                          agent_config['minibatch_sz'],
                                          agent_config.get("seed"))
        if "network_pickle" in agent_config:
            self.network = pickle.load(
                open(agent_config["network_pickle"], 'rb'))
        else:
            self.network = ActionValueNetwork(agent_config['network_config'])
        self.optimizer = Adam(self.network.layer_sizes,
                              agent_config["optimizer_config"])
        self.num_actions = agent_config['network_config']['num_actions']
        self.num_replay = agent_config['num_replay_updates_per_step']
        self.discount = agent_config['gamma']
        self.tau = agent_config['tau']

        self.rand_generator = np.random.RandomState(agent_config.get("seed"))

        self.last_state = None
        self.last_action = None

        self.sum_rewards = 0
        self.episode_steps = 0

    def policy(self, state):
        """
        Args:
            state (Numpy array): the state.
        Returns:
            the action. 
        """
        action_values = self.network.get_action_values(state)
        probs_batch = self.softmax(action_values, self.tau)
        action = self.rand_generator.choice(self.num_actions,
                                            p=probs_batch.squeeze())
        return action

    def agent_start(self, state):
        """The first method called when the experiment starts, called after
        the environment starts.
        Args:
            state (Numpy array): the state from the
                environment's env_start function.
        Returns:
            The first action the agent takes.
        """
        self.sum_rewards = 0
        self.episode_steps = 0
        self.last_state = np.array([state])
        self.last_action = self.policy(self.last_state)
        return self.last_action

    def agent_step(self, reward, state):
        """A step taken by the agent.
        Args:
            reward (float): the reward received for taking the last action taken
            state (Numpy array): the state returned by the environment's step
                function, i.e. where the agent ended up after the last step.
        Returns:
            The action the agent is taking.
        """

        self.sum_rewards += reward
        self.episode_steps += 1

        state = np.array([state])
        action = self.policy(state)

        self.replay_buffer.append(self.last_state, self.last_action, reward, 0,
                                  state)

        if self.replay_buffer.size() > self.replay_buffer.minibatch_size:
            current_q = deepcopy(self.network)
            for _ in range(self.num_replay):
                experiences = self.replay_buffer.sample()
                self.optimize_network(experiences, current_q)

        self.last_state = state
        self.last_action = action

        return action

    def agent_end(self, reward):
        """Run when the agent terminates.
        Args:
            reward (float): the reward the agent received for entering the
                terminal state.
        """
        self.sum_rewards += reward
        self.episode_steps += 1

        state = np.zeros_like(self.last_state)

        self.replay_buffer.append(self.last_state, self.last_action, reward, 1,
                                  state)
        if self.replay_buffer.size() > self.replay_buffer.minibatch_size:
            current_q = deepcopy(self.network)
            for _ in range(self.num_replay):
                experiences = self.replay_buffer.sample()
                self.optimize_network(experiences, current_q)

    def agent_message(self, message):
        if message == "get_sum_reward":
            return self.sum_rewards
        else:
            raise Exception("Unrecognized Message!")

    def softmax(self, action_values, tau=1.0):
        """
        Args:
        action_values (Numpy array): A 2D array of shape (batch_size, num_actions). 
                       The action-values computed by an action-value network.              
        tau (float): The temperature parameter scalar.
        Returns:
        A 2D array of shape (batch_size, num_actions), where each row is a probability
        distribution over the actions, representing the policy.
        """
        preferences = action_values / tau
        max_preference = np.amax(preferences, 1)

        reshaped_max_preference = max_preference.reshape((-1, 1))

        exp_preferences = np.exp(preferences - reshaped_max_preference)
        sum_of_exp_preferences = np.sum(exp_preferences, 1)

        reshaped_sum_of_exp_preferences = sum_of_exp_preferences.reshape(
            (-1, 1))

        action_probs = exp_preferences / reshaped_sum_of_exp_preferences

        action_probs = action_probs.squeeze()
        return action_probs

    def get_td_error(self, states, next_states, actions, rewards, terminals,
                     current_q):
        """
        Args:
        states (Numpy array): The batch of states with the shape (batch_size, state_dim).
        next_states (Numpy array): The batch of next states with the shape (batch_size, state_dim).
        actions (Numpy array): The batch of actions with the shape (batch_size,).
        rewards (Numpy array): The batch of rewards with the shape (batch_size,).
        terminals (Numpy array): The batch of terminals with the shape (batch_size,).
        current_q (ActionValueNetwork): The fixed network used for computing the targets,
                                        and particularly, the action-values at the next-states.
        (The discount factor and the online network are read from self.discount and self.network.)
        Returns:
        The TD errors (Numpy array) for actions taken, of shape (batch_size,)
        """

        q_next_mat = np.apply_along_axis(current_q.get_action_values, 1,
                                         next_states).squeeze()

        probs_mat = self.softmax(q_next_mat, self.tau)

        v_next_vec = np.einsum("ij,ij->i", probs_mat, q_next_mat)
        v_next_vec *= (1 - terminals)

        target_vec = rewards + self.discount * v_next_vec

        q_mat = np.apply_along_axis(self.network.get_action_values, 1,
                                    states).squeeze()

        batch_indices = np.arange(q_mat.shape[0])

        q_vec = np.array([q_mat[i][actions[i]] for i in batch_indices])

        delta_vec = target_vec - q_vec

        return delta_vec

    def optimize_network(self, experiences, current_q):
        """
        Args:
        experiences (Numpy array): The batch of experiences including the states, actions, 
                                   rewards, terminals, and next_states.
        current_q (ActionValueNetwork): The fixed network used for computing the targets,
                                        and particularly, the action-values at the next-states.
        (The discount factor and the online network are read from self.discount and self.network.)
        """

        states, actions, rewards, terminals, next_states = map(
            list, zip(*experiences))
        states = np.concatenate(states)
        next_states = np.concatenate(next_states)
        rewards = np.array(rewards)
        terminals = np.array(terminals)
        batch_size = states.shape[0]

        delta_vec = self.get_td_error(states, next_states, actions, rewards,
                                      terminals, current_q)
        batch_indices = np.arange(batch_size)

        delta_mat = np.zeros((batch_size, self.network.num_actions))
        delta_mat[batch_indices, actions] = delta_vec

        td_update = self.network.get_TD_update(states, delta_mat)

        weights = self.optimizer.update_weights(self.network.get_weights(),
                                                td_update)

        self.network.set_weights(weights)
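
The core of get_td_error above is the expected SARSA backup y = r + gamma * sum_a pi(a|s') * Q(s', a), where pi is the tau-softmax of the next-state action values. A self-contained numpy illustration of that target for one non-terminal transition (all numbers invented):

import numpy as np

gamma, tau = 0.99, 1.0
r = 1.0
q_next = np.array([[1.0, 2.0, 0.5]])               # assumed Q(s', .) for one next state

prefs = q_next / tau
prefs = prefs - prefs.max(axis=1, keepdims=True)   # same max-subtraction trick as softmax()
probs = np.exp(prefs) / np.exp(prefs).sum(axis=1, keepdims=True)

v_next = np.einsum("ij,ij->i", probs, q_next)      # expected next-state value under pi
y = r + gamma * v_next                             # ~2.54 for these numbers
print(y)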
Example #4
def worker_process(job_name, task_index, cluster_dict, file_name):
    import tensorflow as tf
    # GPU training.
    if USE_GPU:
        os.environ["CUDA_VISIBLE_DEVICES"] = CUDA_VISIBLE_DEVICES
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=PER_PROCESS_GPU_MEMORY_FRACTION)
        config = tf.ConfigProto(gpu_options=gpu_options)
    else:
        config = None

    # Create and start a server for the local task.
    cluster = tf.train.ClusterSpec(cluster_dict)
    server = tf.train.Server(cluster,
                             job_name=job_name,
                             task_index=task_index,
                             config=config)

    if job_name == "ps":
        # Parameter server.
        with tf.device("/job:" + job_name + "/task:" + str(task_index)):
            queue = tf.FIFOQueue(cluster.num_tasks("worker"),
                                 tf.int32,
                                 shared_name="done_queue" + str(task_index))
        # Close the parameter server when all queues from workers have been filled.
        with tf.Session(server.target) as sess:
            for i in range(cluster.num_tasks("worker")):
                sess.run(queue.dequeue())
        return []

    elif job_name == "worker":
        # Obtain environment parameters.
        env = make_atari(ENV_NAME)
        obs_space = env.observation_space
        action_space = env.action_space

        # Worker.
        with tf.device(
                tf.train.replica_device_setter(worker_device="/job:" +
                                               job_name + "/task:" +
                                               str(task_index),
                                               cluster=cluster)):
            # Build networks.
            main_network = QValueNetwork(obs_space,
                                         action_space,
                                         name="main_network")
            target_network = QValueNetwork(obs_space,
                                           action_space,
                                           name="target_network",
                                           auxiliary_network=main_network)

        replay_buffer = ReplayBuffer(buffer_size=BUFFER_SIZE)
        list_episodic_reward = []
        episodic_reward = 0
        obs = env.reset()

        # Additional settings for the first worker (task_index = 0).
        if task_index == 0:
            saver = tf.train.Saver(var_list=main_network.variables,
                                   max_to_keep=1)
            next_target_network_update_step = 0
            next_autosave_step = 0

        with tf.train.MonitoredTrainingSession(
                master=server.target,
                is_chief=(task_index == 0),
                config=config,
                save_summaries_steps=None,
                save_summaries_secs=None,
                save_checkpoint_steps=None,
                save_checkpoint_secs=None) as sess:

            # Initialize buffers.
            for _ in range(INITIAL_BUFFER_SIZE):
                # Sample random action.
                action = np.random.randint(action_space.n)
                # Interact with the environment.
                obs_next, reward, done, _ = env.step(action)
                episodic_reward += reward
                if done:
                    obs_next = env.reset()
                    episodic_reward = 0
                # Store data.
                data = [obs, action, reward, done, obs_next]
                replay_buffer.append(data)
                # Update observation.
                obs = obs_next

            # Run until reaching maximum training steps.
            while sess.run(main_network.global_step) < TOTAL_STEP:
                global_step = sess.run(main_network.global_step)
                if task_index == 0:
                    # Synchronize the target network periodically (target network <- main network).
                    if global_step >= next_target_network_update_step:
                        sess.run(target_network.sync_op)
                        next_target_network_update_step += TARGET_NETWORK_UPDATE_STEP

                # Sample action with epsilon-greedy policy.
                epsilon = EPSILON_MAX - (
                    EPSILON_MAX - EPSILON_MIN) * np.minimum(
                        global_step / EPSILON_DECAY_STEP, 1)
                if np.random.uniform() < epsilon:
                    action = np.random.randint(action_space.n)
                else:
                    q = sess.run(target_network.q,
                                 feed_dict={
                                     target_network.Obs:
                                     np.expand_dims(np.array(obs) / 255.0, 0)
                                 })
                    action = np.argmax(q[0])
                # Interact with the environment.
                obs_next, reward, done, _ = env.step(action)
                episodic_reward += reward
                if done:
                    obs_next = env.reset()
                    list_episodic_reward.append((global_step, episodic_reward))
                    delta_time = int(time.time() - start_time)
                    print("Step ",
                          global_step,
                          "/",
                          TOTAL_STEP,
                          ": Time spent = ",
                          delta_time,
                          " s , Episodic reward = ",
                          episodic_reward,
                          sep="")
                    episodic_reward = 0
                # Store data.
                data = [obs, action, reward, done, obs_next]
                replay_buffer.append(data)
                # Update observation.
                obs = obs_next

                # Learning rate.
                lr = LEARNING_RATE[-1]
                for i in range(len(LR_ANNEAL_STEP)):
                    if global_step < LR_ANNEAL_STEP[i]:
                        lr = LEARNING_RATE[i]
                        break

                # Sample training data from the replay buffer.
                batch_data = replay_buffer.sample(BATCH_SIZE)
                batch_obs, batch_action, batch_reward, batch_done, batch_obs_next = \
                  [np.array([batch_data[j][i] for j in range(BATCH_SIZE)]) for i in range(len(batch_data[0]))]

                # Compute the target Q value:
                #   target_q = r + (1 - done) * REWARD_DISCOUNT * max[q(s', a)]
                q_next = sess.run(
                    target_network.q,
                    feed_dict={target_network.Obs: batch_obs_next / 255.0})
                max_qnext = np.amax(q_next, axis=1)
                target_q = batch_reward + (
                    1 - batch_done) * REWARD_DISCOUNT * max_qnext

                # Update the main network (main network <- local network gradients).
                sess.run(main_network.train_op,
                         feed_dict={
                             main_network.Obs: batch_obs / 255.0,
                             main_network.Action: batch_action,
                             main_network.TargetQ: target_q,
                             main_network.LR: lr
                         })

                if task_index == 0:
                    # Save the main network periodically.
                    if global_step >= next_autosave_step:
                        saver.save(sess._sess._sess._sess._sess,
                                   SAVE_DIR + file_name)
                        next_autosave_step += AUTOSAVE_STEP

            if task_index == 0:
                # Save the main network.
                saver.save(sess._sess._sess._sess._sess, SAVE_DIR + file_name)

        tf.contrib.keras.backend.clear_session()
        # Close the environment.
        env.close()

        queues = []
        # Create a shared queue on the worker which is visible on the parameter server.
        for i in range(cluster.num_tasks("ps")):
            with tf.device("/job:ps/task:" + str(i)):
                queue = tf.FIFOQueue(cluster.num_tasks("worker"),
                                     tf.int32,
                                     shared_name="done_queue" + str(i))
                queues.append(queue)
        # Notify all parameter servers that the current worker has finished the task.
        with tf.Session(server.target) as sess:
            for i in range(cluster.num_tasks("ps")):
                sess.run(queues[i].enqueue(task_index))
        # Release memory when a worker is finished.
        tf.contrib.keras.backend.clear_session()

        return list_episodic_reward
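
The exploration schedule used by the worker above is a linear anneal from EPSILON_MAX down to EPSILON_MIN over EPSILON_DECAY_STEP steps, held constant afterwards. A tiny standalone check with made-up constants:

import numpy as np

EPSILON_MAX, EPSILON_MIN, EPSILON_DECAY_STEP = 1.0, 0.1, 1_000_000  # assumed values

def epsilon_at(step):
    # Linear interpolation, clipped once the decay window is exhausted.
    return EPSILON_MAX - (EPSILON_MAX - EPSILON_MIN) * np.minimum(step / EPSILON_DECAY_STEP, 1)

print(epsilon_at(0), epsilon_at(500_000), epsilon_at(2_000_000))  # 1.0 0.55 0.1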
Example #5
class RL_Agent:
    """
    Main activities:
    - Making actual moves in a game.
    - Making search moves during MCTS
    - Updating the target policy via supervised learning

    Must contain the ANET (the actor network).

    Should save net params to file

    """

    def __init__(self, input_shape, output_size, anet_layer_sizes, anet_layer_activations, gm: GameManager,
                 model_load_path, e_greedy, lr=0.01, optimizer="adam"):
        self.e_greedy = e_greedy
        self.anet = ActorNetwork(hidden_layer_sizes=anet_layer_sizes,
                                 hidden_activations=anet_layer_activations,
                                 optimizer=optimizer,
                                 input_shape=input_shape,
                                 output_size=output_size,
                                 model_load_path=model_load_path,
                                 gm=gm,
                                 lr=lr)

        self.rbuf = ReplayBuffer(size_x=len(gm.get_start_state()),
                                 size_y=output_size,
                                 max_size=5000,
                                 default_batch_size=64)
        self.env = gm

    def normalize_action_values(self, action_values, actions_available):
        d = defaultdict()
        all_act = self.env.get_all_actions()
        for action in actions_available:
            idx = all_act.index(action)
            d[action] = action_values[-1][idx]
        return {k: v / total for total in (sum(d.values()),) for k, v in d.items()}

    def default_policy(self, state):  # make general for game.
        moves = self.env.legal_actions(state)
        if random() < self.e_greedy:
            return choice(moves)

        prediction = self.anet.predict(state)
        normalized_prediction = self.normalize_action_values(prediction, moves)  # too slow!
        return max(normalized_prediction, key=lambda action: normalized_prediction[action])

    def retain(self, state, edge_visits):

        self.rbuf.append(x=state, y=self.get_distribution(edge_visits))

    def get_distribution(self, edge_visits: dict):
        """
        dict should be of the form {action: visits}
        :param edge_visits: mapping from action to MCTS visit count
        :return: normalized visit-count distribution over all actions (numpy array)
        """
        all_acts = self.env.get_all_actions()
        distribution = np.zeros(len(all_acts))
        s = sum(edge_visits.values())
        for action, visits in edge_visits.items():
            idx = all_acts.index(action)
            distribution[idx] = visits / s
        return distribution

    def train_rbuf(self, verbose):
        history = self.anet.train(self.rbuf.minibatch())
        if verbose:
            # do something about loss
            print(history.history["loss"])
        return history

    def extend_saved_rbuf(self):
        self.rbuf.save()
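
normalize_action_values above keeps only the network outputs that correspond to legal moves and rescales them into a distribution. A minimal standalone version of the same idea (the action tuples and prediction values are invented):

import numpy as np

all_actions = [(0, 0), (0, 1), (1, 0), (1, 1)]   # assumed full action set
prediction = np.array([0.1, 0.4, 0.3, 0.2])      # assumed network output
legal_moves = [(0, 1), (1, 1)]

indices = [all_actions.index(a) for a in legal_moves]
masked = prediction[indices]
normalized = dict(zip(legal_moves, masked / masked.sum()))
best_move = max(normalized, key=normalized.get)  # (0, 1)
print(normalized, best_move)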
Example #6
env = Gridworld(10)
gpu_num = 1

dqn = GoalQWrapper(env, 'dqn', 0)
buffer = ReplayBuffer(100000)

steps_before_train = 1000
viz_freq = 1000
batch_size = 32

s = env.reset()
for time in itertools.count():

    a = np.random.randint(0, 4)
    sp, r, t, info = env.step(a)
    buffer.append(s, a, r, sp, t)
    s = sp
    if time < steps_before_train:
        continue

    s_batch, a_batch, r_batch, sp_batch, t_batch = buffer.sample(batch_size)
    g_batch, _, _, _, _ = buffer.sample(batch_size)
    loss = dqn.train_batch_goals(time, s_batch, a_batch, sp_batch, g_batch)
    print(time, loss)

    if time % viz_freq == 0:
        visualize_all_values(dqn, env.get_all_states())



Example #7
def training(file_name):
    # Create folders.
    if not os.path.isdir(SAVE_DIR):
        os.makedirs(SAVE_DIR)
    if not os.path.isdir(CSV_DIR):
        os.makedirs(CSV_DIR)
    if not os.path.isdir(FIGURE_TRAINING_DIR):
        os.makedirs(FIGURE_TRAINING_DIR)

    # Load models.
    actor = Actor(name="actor")
    actor_target = Actor(name="actor_target")
    actor_initial_update_op = target_update_op(
        actor.trainable_variables, actor_target.trainable_variables, 1.0)
    actor_target_update_op = target_update_op(actor.trainable_variables,
                                              actor_target.trainable_variables,
                                              TARGET_UPDATE_RATE)

    critic = Critic(name="critic")
    critic.build_training()
    critic_target = Critic(name="critic_target")
    critic_initial_update_op = target_update_op(
        critic.trainable_variables, critic_target.trainable_variables, 1.0)
    critic_target_update_op = target_update_op(
        critic.trainable_variables, critic_target.trainable_variables,
        TARGET_UPDATE_RATE)

    critic_with_actor = Critic(name="critic", A=actor.pi)
    actor.build_training(critic_with_actor.actor_loss)

    env = PendulumEnv()
    replay_buffer = ReplayBuffer(BUFFER_SIZE)
    action_noise = OUActionNoise(np.zeros(A_LENGTH))

    with tf.Session() as sess:
        # Initialize actor and critic networks.
        sess.run(tf.global_variables_initializer())
        sess.run([actor_initial_update_op, critic_initial_update_op])

        list_final_reward = []

        additional_episode = int(np.ceil(MIN_BUFFER_SIZE / MAX_FRAME))
        for episode in range(-additional_episode, MAX_EPISODE):
            list_actor_loss = []
            list_critic_loss = []

            # Reset the environment and noise.
            s = env.reset()
            action_noise.reset()

            for step in range(MAX_FRAME):
                env.render()

                # Get action.
                a = sess.run(actor.pi,
                             feed_dict={actor.S: np.reshape(s, (1, -1))})
                noise = action_noise.get_noise()
                a = a[0] + ACTION_SCALING * noise
                a = np.clip(a, -ACTION_SCALING, ACTION_SCALING)

                # Interact with the game engine.
                s1, r, _, _ = env.step(a)

                # Add data to the replay buffer.
                data = [s, a, [r], s1]
                replay_buffer.append(data)

                if episode >= 0:
                    for _ in range(BATCHES_PER_STEP):
                        # Sample data from the replay buffer.
                        batch_data = replay_buffer.sample(BATCH_SIZE)
                        batch_s, batch_a, batch_r, batch_s1 = [
                            np.array(
                                [batch_data[j][i] for j in range(BATCH_SIZE)])
                            for i in range(len(batch_data[0]))
                        ]

                        # Compute the next action.
                        a1 = sess.run(actor_target.pi,
                                      feed_dict={actor_target.S: batch_s1})

                        # Compute the target Q.
                        q1 = sess.run(critic_target.q,
                                      feed_dict={
                                          critic_target.S: batch_s1,
                                          critic_target.A: a1
                                      })
                        q_target = batch_r + DISCOUNT * q1

                        # Update actor and critic.
                        _, _, actor_loss, critic_loss = sess.run(
                            [
                                actor.train_op, critic.train_op,
                                actor.actor_loss, critic.critic_loss
                            ],
                            feed_dict={
                                actor.S: batch_s,
                                critic_with_actor.S: batch_s,
                                actor.LR: LR_ACTOR,
                                critic.S: batch_s,
                                critic.A: batch_a,
                                critic.QTarget: q_target,
                                critic.LR: LR_CRITIC
                            })
                        list_actor_loss.append(actor_loss)
                        list_critic_loss.append(critic_loss)

                        # Update target networks.
                        sess.run(
                            [actor_target_update_op, critic_target_update_op])

                s = s1

            # Postprocessing after each episode.
            if episode >= 0:
                list_final_reward.append(r)
                avg_actor_loss = np.mean(list_actor_loss)
                avg_critic_loss = np.mean(list_critic_loss)
                print("Episode ", format(episode, "03d"), ":", sep="")
                print("  Final Reward = ",
                      format(r, ".6f"),
                      ", Actor Loss = ",
                      format(avg_actor_loss, ".6f"),
                      ", Critic Loss = ",
                      format(avg_critic_loss, ".6f"),
                      sep="")

        # Testing.
        avg_reward = 0
        for i in range(TEST_EPISODE):
            # Reset the environment and noise.
            s = env.reset()
            action_noise.reset()

            for step in range(MAX_FRAME):
                env.render()

                # Get action.
                a = sess.run(actor.pi,
                             feed_dict={actor.S: np.reshape(s, (1, -1))})
                a = a[0]

                # Interact with the game engine.
                s, r, _, _ = env.step(a)

            # Postprocessing after each episode.
            avg_reward += r
        avg_reward /= TEST_EPISODE

        # Save the parameters.
        saver = tf.train.Saver(
            [*actor.trainable_variables, *critic.trainable_variables])
        saver.save(sess, SAVE_DIR + file_name)
    tf.contrib.keras.backend.clear_session()
    env.close()

    # Store data in the csv file.
    with open(CSV_DIR + file_name + ".csv", "w") as f:
        fieldnames = ["Episode", "Final Reward", "Average Reward"]
        writer = csv.DictWriter(f, fieldnames=fieldnames, lineterminator="\n")
        writer.writeheader()
        for episode in range(MAX_EPISODE):
            content = {
                "Episode": episode,
                "Final Reward": list_final_reward[episode]
            }
            if episode == MAX_EPISODE - 1:
                content.update({"Average Reward": avg_reward})
            writer.writerow(content)

    # Plot the training process.
    list_episode = list(range(MAX_EPISODE))
    f, ax = plt.subplots(nrows=1, ncols=1, figsize=(5, 5))
    ax.plot(list_episode, list_final_reward, "r-", label="Final Reward")
    ax.plot([MAX_EPISODE - 1], [avg_reward], "b.", label="Average Reward")
    ax.set_title("Final Reward")
    ax.set_xlabel("Episode")
    ax.set_ylabel("Reward")
    ax.legend(loc="lower right")
    ax.grid()

    f.savefig(FIGURE_TRAINING_DIR + file_name + ".png")
    plt.close(f)
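
The critic target inside the training loop above is the standard DDPG bootstrap, q_target = r + DISCOUNT * Q'(s1, mu'(s1)), with no terminal mask because the pendulum episode ends only at the frame limit. A toy numeric check (values invented):

import numpy as np

DISCOUNT = 0.99                      # assumed value
batch_r = np.array([[0.5], [1.0]])   # rewards are stored as [r] above, hence the extra axis
q1 = np.array([[2.0], [3.0]])        # Q'(s1, mu'(s1)) from the two target networks
q_target = batch_r + DISCOUNT * q1
print(q_target)                      # [[2.48] [3.97]]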
Example #8
def train(env, agent, args):
    """
    Trains the given agent in the given environment,
    following the specification in the arguments passed via command-line
    :param env: environment
    :type env: OpenAI gym environment
    :param agent: agent to be trained
    :type agent: SAC
    :param args: the arguments parsed from command-line
    :type args: object returned by argparse library
    :return: array with the returns per episode cumulated by the agent during training
    :rtype: numpy array of dtype float32
    """

    if args.max_episode_steps is not None:
        # if user has specified a maximum number of steps per episode, set it
        env.set_max_episode_steps(args.max_episode_steps)

    # build replay buffer
    replay_buffer = ReplayBuffer(args.replay_size)

    total_steps = 0
    updates = 0
    returns = []
    epsilon = args.initial_epsilon

    # for each episode counting from 1
    for i_episode in itertools.count(1):
        # reset the environment and the episode counters, and get the initial state
        state = env.reset()
        episode_return = 0
        i_step = 0

        # for each step in the episode
        for i_step in itertools.count(0):
            if args.render:
                env.render()

            # if user has specified a number of initial exploratory steps,
            # then just sample a random action from the environment action space
            # if user has specified an epsilon randomness different from zero (and the exploratory steps are over)
            # then just sample a random action from the environment action space
            # otherwise let the agent choose an appropriate action
            if total_steps <= args.exploratory_steps:
                action = env.action_space.sample()
            elif epsilon > 0 and np.random.uniform(0, 1) <= epsilon:
                action = env.action_space.sample()
            else:
                action = agent.choose_action(state)

            # perform the action and observe the resulting next state, reward and done signal
            next_state, reward, done, _ = env.step(action)

            # if very verbose print per step log
            if args.verbose >= 2:
                print("Step: {}".format(i_step))
                print("(s,a,r,s',d): ({}, {}, {}, {}, {})".format(
                    state, action, reward, next_state, done))

            # append observed transition to replay buffer
            replay_buffer.append(state, action, reward, next_state, done)

            # if user has specified a number of steps without having the agent update its networks (and learn),
            # then skip the update
            # if that phase is over, then proceed to update agent's networks
            if total_steps > args.learning_starts and len(
                    replay_buffer) > args.batch_size:
                for _ in range(args.gradient_steps):
                    q1l, q2l, pl, al = agent.update(replay_buffer,
                                                    args.batch_size, updates)
                    if args.verbose >= 2:
                        print("Losses: ({}, {}, {}, {})".format(
                            q1l, q2l, pl, al))
                    updates += 1

            # update per step variables and cumulate episode return
            state = next_state
            episode_return += reward
            i_step += 1
            total_steps += 1

            # if received done signal from the environment, then terminate the episode
            if done:
                break

        # append the cumulated episode return to the array
        returns.append(episode_return)

        # if verbose print a summary of the training occurred in the last episode
        if args.verbose >= 1:
            summary = "Episode: {}. Steps: {}. Episode steps: {}. Episode return: {:.3f}.\n".format(
                i_episode, total_steps, i_step, episode_return)
            if args.learning_starts > total_steps:
                summary += "Learning starts in: {} steps. ".format(
                    args.learning_starts - total_steps)
            if args.exploratory_steps > total_steps:
                summary += "Exploratory steps left: {}. ".format(
                    args.exploratory_steps - total_steps)
            elif epsilon > 0:
                summary += "Epsilon: {:.3f}.".format(epsilon)

            print(summary)

        # if user has specified plotting, then plot the returns cumulated so far
        if args.plot and i_episode % args.plot_interval == 0:
            plot_mean_k_episodes_return(returns)

        # if user has specified a fixed number of training episodes, check if time is up
        if args.train_episodes is not None and i_episode >= args.train_episodes:
            break

        # update epsilon randomness coefficient,
        # if still positive and if exploratory phase is over and learning has started
        # linear decrease update wins over exponential decay update, in case user specified both
        if epsilon > 0 and \
        total_steps > args.learning_starts and \
        total_steps > args.exploratory_steps:
            if args.epsilon_decrease > 0 and epsilon > args.final_epsilon:
                epsilon = max(args.final_epsilon,
                              epsilon - args.epsilon_decrease)
            elif args.epsilon_decay > 0:
                epsilon *= args.epsilon_decay

    return np.array(returns)
Example #9
def train(file_name):
  # Create folders.
  if not os.path.isdir(SAVE_DIR):
    os.makedirs(SAVE_DIR)
  if not os.path.isdir(FIGURE_TRAINING_DIR):
    os.makedirs(FIGURE_TRAINING_DIR)
  
  # Obtain environment parameters.
  env = make_atari(ENV_NAME)
  obs_space = env.observation_space
  action_space = env.action_space
  
  # Build networks.
  main_network = QValueNetwork(obs_space, action_space, name = "main_network")
  target_network = QValueNetwork(obs_space, action_space, name = "target_network", auxiliary_network = main_network)
  variables_initializer = tf.global_variables_initializer()
  
  replay_buffer = ReplayBuffer(buffer_size = BUFFER_SIZE)
  start_time = time.time()
  list_episodic_reward = []
  episodic_reward = 0
  
  obs = env.reset()
  
  with tf.Session() as sess:
    # Initialize all variables.
    sess.run(variables_initializer)
    # Only save the main network.
    saver = tf.train.Saver(var_list = main_network.variables)
    
    # Initialize buffers.
    for _ in range(INITIAL_BUFFER_SIZE):
      # Sample random action.
      action = np.random.randint(action_space.n)
      # Interact with the environment.
      obs_next, reward, done, _ = env.step(action)
      episodic_reward += reward
      if done:
        obs_next = env.reset()
        episodic_reward = 0
      # Store data.
      data = [obs, action, reward, done, obs_next]
      replay_buffer.append(data)
      # Update observation.
      obs = obs_next
    
    for step in range(TOTAL_STEP):
      # Synchronize the target network periodically (target network <- main network).
      if step % TARGET_NETWORK_UPDATE_STEP == 0:
        sess.run(target_network.sync_op)
      
      # Sample action with epsilon-greedy policy.
      epsilon = EPSILON_MAX - (EPSILON_MAX - EPSILON_MIN) * np.minimum(step / EPSILON_DECAY_STEP, 1)
      if np.random.uniform() < epsilon:
        action = np.random.randint(action_space.n)
      else:
        q = sess.run(target_network.q, feed_dict = {target_network.Obs: np.expand_dims(np.array(obs) / 255.0, 0)})
        action = np.argmax(q[0])
      # Interact with the environment.
      obs_next, reward, done, _ = env.step(action)
      episodic_reward += reward
      if done:
        obs_next = env.reset()
        list_episodic_reward.append((step, episodic_reward))
        delta_time = int(time.time() - start_time)
        print("Step ", step, "/", TOTAL_STEP, ": Time spent = ", delta_time, " s , Episodic reward = ", episodic_reward, sep = "")
        episodic_reward = 0
      # Store data.
      data = [obs, action, reward, done, obs_next]
      replay_buffer.append(data)
      # Update observation.
      obs = obs_next
      
      # Learning rate.
      lr = LEARNING_RATE[-1]
      for i in range(len(LR_ANNEAL_STEP)):
        if step < LR_ANNEAL_STEP[i]:
          lr = LEARNING_RATE[i]
          break
      
      # Sample training data from the replay buffer.
      batch_data = replay_buffer.sample(BATCH_SIZE)
      batch_obs, batch_action, batch_reward, batch_done, batch_obs_next = \
        [np.array([batch_data[j][i] for j in range(BATCH_SIZE)]) for i in range(len(batch_data[0]))]
      
      # Compute the target Q value:
      #   target_q = r + (1 - done) * REWARD_DISCOUNT * max[q(s', a)]
      q_next = sess.run(target_network.q, feed_dict = {target_network.Obs: batch_obs_next / 255.0})
      max_qnext = np.amax(q_next, axis = 1)
      target_q = batch_reward + (1 - batch_done) * REWARD_DISCOUNT * max_qnext
      
      # Update the main network.
      sess.run(main_network.train_op, feed_dict = {
        main_network.Obs: batch_obs / 255.0, main_network.Action: batch_action, main_network.TargetQ: target_q, main_network.LR: lr
        })
      
      # Save the main network periodically.
      if step % AUTOSAVE_STEP == 0:
        saver.save(sess, SAVE_DIR + file_name)
    
    # Save the main network.
    saver = tf.train.Saver(var_list = main_network.variables)
    saver.save(sess, SAVE_DIR + file_name)
  
  total_time = int(time.time() - start_time)
  print("Training finished in ", total_time, " s.", sep = "")
  
  # Close the environment.
  env.close()
  
  # Plot the episodic reward against training step curve.
  plot_episodic_reward(list_episodic_reward, file_name)
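
The target computed in Example #9 above (and likewise in Examples #2 and #4) is the one-step Q-learning backup with a terminal mask, target_q = r + (1 - done) * REWARD_DISCOUNT * max_a Q_target(s', a). A small numpy illustration with invented numbers:

import numpy as np

REWARD_DISCOUNT = 0.99                          # assumed value
batch_reward = np.array([1.0, 0.0])
batch_done = np.array([0.0, 1.0])               # second transition is terminal
q_next = np.array([[0.5, 2.0], [3.0, 1.0]])     # Q_target(s', .) for each sample

max_qnext = np.amax(q_next, axis=1)             # [2.0, 3.0]
target_q = batch_reward + (1 - batch_done) * REWARD_DISCOUNT * max_qnext
print(target_q)                                 # [2.98 0.  ]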
Example #10
class Agent:

    Transition = namedtuple(
        'Transition', ('state', 'action', 'next_state', 'reward', 'done'),
        rename=False)  # rename=False: invalid field names raise instead of being auto-renamed

    def __init__(self, env, hyperparameters, device, writer, max_games,
                 tg_bot):
        self.eps_start = hyperparameters['eps_start']
        self.eps_end = hyperparameters['eps_end']
        self.eps_decay = hyperparameters['eps_decay']
        self.epsilon = hyperparameters['eps_start']
        self.n_iter_update_nn = hyperparameters['n_iter_update_nn']
        self.max_games = max_games
        self.tg_bot = tg_bot
        self.env = env

        self.agent_control = AgentControl(env, device,
                                          hyperparameters['learning_rate'],
                                          hyperparameters['gamma'],
                                          hyperparameters['multi_step'],
                                          hyperparameters['double_dqn'],
                                          hyperparameters['dueling'])
        self.replay_buffer = ReplayBuffer(hyperparameters['buffer_size'],
                                          hyperparameters['buffer_minimum'],
                                          hyperparameters['multi_step'],
                                          hyperparameters['gamma'])
        self.summary_writer = writer

        self.num_iterations = 0
        self.total_reward = 0
        self.num_games = 0
        self.total_loss = []
        self.ts_frame = 0
        self.ts = time.time()
        self.birth_time = time.time()
        self.rewards = []

        if self.tg_bot:
            tg.welcome_msg(hyperparameters['multi_step'],
                           hyperparameters['double_dqn'],
                           hyperparameters['dueling'])

    def select_greedy_action(self, obs):
        # Give current state to the control who will pass it to NN which will
        # return all actions and the control will take max and return it here
        return self.agent_control.select_greedy_action(obs)

    def select_eps_greedy_action(self, obs):
        rand_num = random.rand()
        if self.epsilon > rand_num:
            # Select random action - explore
            return self.env.action_space.sample()
        else:
            # Select best action
            return self.select_greedy_action(obs)

    def add_to_buffer(self, obs, action, new_obs, reward, done):
        transition = self.Transition(state=obs,
                                     action=action,
                                     next_state=new_obs,
                                     reward=reward,
                                     done=done)
        self.replay_buffer.append(transition)
        self.num_iterations = self.num_iterations + 1
        if self.epsilon > self.eps_end:
            self.epsilon = self.eps_start - self.num_iterations / self.eps_decay
        self.total_reward = self.total_reward + reward

    def sample_and_improve(self, batch_size):
        # If buffer is big enough
        if len(self.replay_buffer.buffer) > self.replay_buffer.minimum:
            # Sample batch_size number of transitions from buffer B
            mini_batch = self.replay_buffer.sample(batch_size)
            # Calculate loss and improve NN
            loss = self.agent_control.improve(mini_batch)
            # So we can calculate mean of all loss during one game
            self.total_loss.append(loss)

        if (self.num_iterations % self.n_iter_update_nn) == 0:
            self.agent_control.update_target_nn()

    def reset_parameters(self):
        self.rewards.append(self.total_reward)
        self.total_reward = 0
        self.num_games = self.num_games + 1
        self.total_loss = []

    def print_info(self):
        # print(self.num_iterations, self.ts_frame, time.time(), self.ts)
        fps = (self.num_iterations - self.ts_frame) / (time.time() - self.ts)
        print('%d %d rew:%d mean_rew:%.2f fps:%d, eps:%.2f, loss:%.4f' %
              (self.num_iterations, self.num_games, self.total_reward,
               np.mean(self.rewards[-40:]), fps, self.epsilon,
               np.mean(self.total_loss)))
        self.ts_frame = self.num_iterations
        self.ts = time.time()

        if self.summary_writer is not None:
            self.summary_writer.add_scalar('reward', self.total_reward,
                                           self.num_games)
            self.summary_writer.add_scalar('mean_reward',
                                           np.mean(self.rewards[-40:]),
                                           self.num_games)
            self.summary_writer.add_scalar('10_mean_reward',
                                           np.mean(self.rewards[-10:]),
                                           self.num_games)
            self.summary_writer.add_scalar('epsilon', self.epsilon,
                                           self.num_games)
            self.summary_writer.add_scalar('loss', np.mean(self.total_loss),
                                           self.num_games)

        if self.tg_bot:
            if (self.num_games % 10) == 0:
                tg.info_msg(self.num_games + 1, self.max_games,
                            np.mean(self.rewards[-40:]),
                            np.mean(self.total_loss))
            if self.num_games == (self.max_games - 1):
                tg.end_msg(time.time() - self.birth_time)
Example #11
def train(file_name):
    # Create folders.
    if not os.path.isdir(SAVE_DIR):
        os.makedirs(SAVE_DIR)
    if not os.path.isdir(FIGURE_TRAINING_DIR):
        os.makedirs(FIGURE_TRAINING_DIR)

    # Obtain environment parameters.
    env = make_atari(ENV_NAME)
    obs_space = env.observation_space
    action_space = env.action_space
    env.close()

    # Build networks.
    main_network = QValueNetwork(obs_space, action_space, name="main_network")
    target_network = QValueNetwork(obs_space,
                                   action_space,
                                   name="target_network",
                                   auxiliary_network=main_network)
    variables_initializer = tf.global_variables_initializer()

    # Create parallel environments.
    par_env = ParallelEnvironment(
        [make_atari(ENV_NAME) for _ in range(NUM_ENV)])

    replay_buffer = ReplayBuffer(buffer_size=BUFFER_SIZE)
    start_time = time.time()
    list_episodic_reward = []
    episodic_reward = np.zeros(NUM_ENV)

    obs = par_env.reset()

    with tf.Session() as sess:
        # Initialize all variables.
        sess.run(variables_initializer)
        # Only save the main network.
        saver = tf.train.Saver(var_list=main_network.variables)

        # Initialize buffers.
        while replay_buffer.get_size() < INITIAL_BUFFER_SIZE:
            # Sample random action.
            action = np.random.randint(action_space.n, size=NUM_ENV)
            # Interact with the environment.
            obs_next, reward, done, _ = par_env.step(action)
            episodic_reward += reward
            for i in range(NUM_ENV):
                if done[i]:
                    episodic_reward[i] = 0
            # Store data.
            for i in range(NUM_ENV):
                data = [obs[i], action[i], reward[i], done[i], obs_next[i]]
                replay_buffer.append(data)
            # Update observation.
            obs = obs_next

        step = 0
        next_target_network_update_step = 0
        next_autosave_step = 0
        while step < TOTAL_STEP:
            # Synchronize the target network periodically (target network <- main network).
            if step >= next_target_network_update_step:
                sess.run(target_network.sync_op)
                next_target_network_update_step += TARGET_NETWORK_UPDATE_STEP

            # Sample action with epsilon-greedy policy.
            epsilon = EPSILON_MAX - (EPSILON_MAX - EPSILON_MIN) * np.minimum(
                step / EPSILON_DECAY_STEP, 1)
            random_uniform = np.random.uniform(size=NUM_ENV)
            action = np.zeros(NUM_ENV, dtype=np.int32)
            random_action_index = np.argwhere(random_uniform < epsilon)
            if np.shape(random_action_index)[0] > 0:
                action[tuple(
                    np.transpose(random_action_index))] = np.random.randint(
                        action_space.n, size=np.shape(random_action_index)[0])
            greedy_action_index = np.argwhere(random_uniform >= epsilon)
            if np.shape(greedy_action_index)[0] > 0:
                q = sess.run(target_network.q,
                             feed_dict={
                                 target_network.Obs:
                                 np.array(obs)[tuple(
                                     np.transpose(greedy_action_index))] /
                                 255.0
                             })
                action[tuple(np.transpose(greedy_action_index))] = np.argmax(
                    q, axis=1)
            # Interact with the environment.
            obs_next, reward, done, _ = par_env.step(action)
            episodic_reward += reward
            for i in range(NUM_ENV):
                if done[i]:
                    list_episodic_reward.append((step, episodic_reward[i]))
                    delta_time = int(time.time() - start_time)
                    print("Step ",
                          step,
                          "/",
                          TOTAL_STEP,
                          ": Time spent = ",
                          delta_time,
                          " s , Episodic reward = ",
                          episodic_reward[i],
                          sep="")
                    episodic_reward[i] = 0
            # Store data.
            for i in range(NUM_ENV):
                data = [obs[i], action[i], reward[i], done[i], obs_next[i]]
                replay_buffer.append(data)
            # Update observation.
            obs = obs_next

            # Pick the learning rate from the piecewise-constant annealing schedule.
            lr = LEARNING_RATE[-1]
            for i in range(len(LR_ANNEAL_STEP)):
                if step < LR_ANNEAL_STEP[i]:
                    lr = LEARNING_RATE[i]
                    break

            for _ in range(NUM_ENV):
                # Sample training data from the replay buffer.
                batch_data = replay_buffer.sample(BATCH_SIZE)
                batch_obs, batch_action, batch_reward, batch_done, batch_obs_next = \
                  [np.array([batch_data[j][i] for j in range(BATCH_SIZE)]) for i in range(len(batch_data[0]))]

                # Compute the target Q value:
                #   target_q = r + (1 - done) * REWARD_DISCOUNT * max[q(s', a)]
                q_next = sess.run(
                    target_network.q,
                    feed_dict={target_network.Obs: batch_obs_next / 255.0})
                max_qnext = np.amax(q_next, axis=1)
                target_q = batch_reward + (
                    1 - batch_done) * REWARD_DISCOUNT * max_qnext

                # Update the main network.
                sess.run(main_network.train_op,
                         feed_dict={
                             main_network.Obs: batch_obs / 255.0,
                             main_network.Action: batch_action,
                             main_network.TargetQ: target_q,
                             main_network.LR: lr
                         })

            # Save the main network periodically.
            if step >= next_autosave_step:
                saver.save(sess, SAVE_DIR + file_name)
                next_autosave_step += AUTOSAVE_STEP

            # Update step.
            step += NUM_ENV

        # Save the main network.
        saver = tf.train.Saver(var_list=main_network.variables)
        saver.save(sess, SAVE_DIR + file_name)

    total_time = int(time.time() - start_time)
    print("Training finished in ", total_time, " s.", sep="")

    # Close the environment.
    par_env.close()

    # Plot the episodic reward against training step curve.
    plot_episodic_reward(list_episodic_reward, file_name)
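A note on plot_episodic_reward: it is defined elsewhere in this project. A minimal matplotlib sketch of what it might do, assuming it receives the (step, reward) list collected above and reuses the FIGURE_TRAINING_DIR constant from this file (the signature and output path are assumptions):

import matplotlib.pyplot as plt

def plot_episodic_reward(list_episodic_reward, file_name):
    # Unpack (training step, episodic reward) pairs and plot reward against step.
    steps = [item[0] for item in list_episodic_reward]
    rewards = [item[1] for item in list_episodic_reward]
    plt.figure()
    plt.plot(steps, rewards)
    plt.xlabel("Training step")
    plt.ylabel("Episodic reward")
    plt.savefig(FIGURE_TRAINING_DIR + file_name + ".png")
    plt.close()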
Exemple #12
0
class DQNAgent:
    def __init__(self,
                 env,
                 QNetworkClass,
                 minibatch_size_limit=32,
                 replay_memory_size=1000000,
                 history_length=4,
                 target_update_step=10000,
                 discount_factor=0.99,
                 learning_rate=0.00025,
                 initial_exploration=1.0,
                 final_exploration=0.1,
                 final_exploration_frame=1000000,
                 replay_start_size=50000,
                 log_dir=None):
        self.env = env
        self.new_episode()

        self.n_actions = env.action_space.n
        self.n_states = env.observation_space.shape[0]
        self.n_input = self.n_states * history_length

        # Set up TensorFlow
        self.sess = tf.Session()

        self.q = QNetworkClass("q_orig", self.n_input, self.n_actions,
                               learning_rate)
        self.q_hat = QNetworkClass("q_hat", self.n_input, self.n_actions)

        self.total_reward = tf.placeholder(tf.float32)
        tf.summary.scalar("TotalReward", self.total_reward)

        self.sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver(max_to_keep=0)
        self.summary = tf.summary.merge_all()

        if log_dir:
            # Recreate the log directory and attach a summary writer.
            if tf.gfile.Exists(log_dir):
                tf.gfile.DeleteRecursively(log_dir)
            tf.gfile.MakeDirs(log_dir)
            self.log_writer = tf.summary.FileWriter(log_dir,
                                                    self.sess.graph,
                                                    flush_secs=20)
        else:
            self.log_writer = None
        # Store parameters
        self.minibatch_size_limit = minibatch_size_limit
        self.gamma = discount_factor

        self.replay_buffer = ReplayBuffer(replay_memory_size)

        self.target_update_step = target_update_step
        self.step = 0

        self.phi_t = np.zeros((1, self.n_input)).astype(np.float32)

        self.epsilon = initial_exploration
        self.replay_start_size = replay_start_size
        self.final_exploration = final_exploration
        self.epsilon_step = (initial_exploration - final_exploration) \
                            / final_exploration_frame

    def act(self):
        a_t = np.argmax(self._perform_q(self.phi_t))

        s_t_1, r_t, terminal, _ = self.env.step(a_t)
        phi_t_1 = np.hstack(
            (self.phi_t[:, self.n_states:], s_t_1.astype(np.float32).reshape(
                (1, -1))))
        self.phi_t = phi_t_1

        return a_t, s_t_1, r_t, terminal, {'epsilon': self.epsilon}

    def act_and_train(self):
        # With probability epsilon select a random action
        # Otherwise select an action from the Q network
        if random.random() <= self.epsilon:
            a_t = random.randint(0, self.n_actions - 1)
        else:
            a_t = np.argmax(self._perform_q(self.phi_t))

        # Execute action in emulator and observe reward and state
        s_t_1, r_t, terminal, _ = self.env.step(a_t)
        phi_t_1 = np.hstack(
            (self.phi_t[:, self.n_states:], s_t_1.astype(np.float32).reshape(
                (1, -1))))

        # Store transition
        self.replay_buffer.append([self.phi_t, a_t, r_t, phi_t_1, terminal])
        self.phi_t = phi_t_1

        # After the specified number of steps, start experience replay to update the Q network
        if self.step >= self.replay_start_size:
            # sample minibatch
            y = np.zeros((0, self.n_actions))
            phi = np.zeros((0, self.n_input))
            minibatch = self.replay_buffer.sample(self.minibatch_size_limit)

            for phi_j, a_j, r_j, phi_j_1, terminal_j in minibatch:
                y_j = self._perform_q(phi_j)[0]
                if terminal_j:
                    y_j[a_j] = r_j
                else:
                    # Double DQN: select the action with the online network `q`,
                    # evaluate it with the target network `q_hat` (reduces the
                    # overestimation bias of plain DQN).
                    a = np.argmax(self._perform_q(phi_j_1))
                    y_j[a_j] = r_j + self.gamma * self._perform_q_hat(phi_j_1)[
                        0, a]
                    # Plain DQN alternative:
                    # y_j[a_j] = r_j + self.gamma * np.max(self._perform_q_hat(phi_j_1))
                y = np.vstack((y, y_j))
                phi = np.vstack((phi, phi_j))

            # Update Q network. TODO: conversion to numpy array should be done in the Q network class
            self._train_q(np.array(phi, dtype=np.float32),
                          np.array(y, dtype=np.float32))

            # Update target Q network every specific steps
            if self.step % self.target_update_step == 0:
                self._update_q_hat()

            # Update Exploration ratio
            if self.epsilon > self.final_exploration:
                self.epsilon -= self.epsilon_step

        self.step += 1

        return a_t, s_t_1, r_t, terminal, {'epsilon': self.epsilon}

    def new_episode(self):
        self.env.reset()

    def write_summary(self, episode, total_reward):
        summary = self.sess.run(
            self.summary,
            feed_dict={self.total_reward: np.array(total_reward)})
        self.log_writer.add_summary(summary, episode)

    def save_variables(self, step, model_dir=None):
        if model_dir:
            if not tf.gfile.Exists(model_dir):
                tf.gfile.MakeDirs(model_dir)
            full_path = os.path.join(model_dir, 'model')
            self.saver.save(self.sess, full_path, global_step=step)
            print('save model to ' + full_path)

    def restore_variables(self, model_path=None):
        if model_path:
            self.saver.restore(self.sess, model_path)
            print('Restore model from ' + model_path)

    def _perform_q(self, x):
        return self.q(self.sess, x)

    def _perform_q_hat(self, x):
        return self.q_hat(self.sess, x)

    def _train_q(self, x, t):
        self.q.train(self.sess, x, t)

    def _update_q_hat(self):
        self.q_hat.set_variables(self.sess, self.q.read_variables(self.sess))

    def __del__(self):
        if self.log_writer:
            self.log_writer.close()
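A hypothetical driver loop for this agent, based only on the methods shown above (the environment, the QNetwork class, the episode count and the reporting cadence are assumptions):

# Sketch: train DQNAgent on a vector-observation gym environment.
agent = DQNAgent(env, QNetwork, log_dir='logs/dqn')
for episode in range(1000):
    agent.new_episode()
    total_reward, terminal = 0.0, False
    while not terminal:
        a_t, s_t_1, r_t, terminal, info = agent.act_and_train()
        total_reward += r_t
    agent.write_summary(episode, total_reward)
    if episode % 100 == 0:
        agent.save_variables(episode, model_dir='models/dqn')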
Exemple #13
0
def ddpg_training(plt, args=None):

    print("Using {} environment.".format(env.spec.id))
    print('observation space {} '.format(env.observation_space))
    print('action space {} high {} low {}'.format(env.action_space,
                                                  env.action_space.high,
                                                  env.action_space.low))
    critic.summary()
    actor.summary()

    # create target networks
    criticp = keras.models.clone_model(critic)
    criticp.compile(optimizer='adam', loss='mse')
    criticp.set_weights(critic.get_weights())
    actorp = keras.models.clone_model(actor)
    actorp.compile(optimizer='adam', loss='mse')
    actorp.set_weights(actor.get_weights())

    #allocate replay buffers
    replay_buffer = ReplayBuffer(Config.buffer_length,
                                 env.observation_space.shape,
                                 env.action_space.shape)

    #set up the plotting - imports must be here to enable matplotlib.use()
    plt.ion()
    from util import ToggleFlags
    from display import display_progress

    flags = ToggleFlags(args)
    flags.add('noise', True)
    flags.add('render', False)
    flags.add('clear')
    flags.add('viz', True)
    flags.add('movie', True)
    flags.add('trails', False)

    RewardsHistory = []
    Rdfr = np.zeros((Config.buffer_length, ))
    episodes = []
    epochs = int(Config.buffer_length / Config.batch_size)

    for i_episode in range(Config.max_episodes):
        observation1 = env.reset()
        episode = []
        RewardsHistory.append(0)
        for t in range(Config.max_steps):
            episode.append(replay_buffer.index)
            #take step using the action based on actor
            observation = observation1
            action = actor.predict(np.expand_dims(observation, axis=0))[0]
            if flags.noise: action += exploration.sample()
            observation1, reward, done, _ = env.step(action)
            if len(observation1.shape) > 1 and observation1.shape[-1] == 1:
                observation1 = np.squeeze(observation1, axis=-1)

            # insert into replay buffer
            replay_buffer.append(observation, action, reward, observation1,
                                 done)

            #book keeping
            RewardsHistory[-1] += reward
            if flags.render: env.render()
            if done: break
            if replay_buffer.index == 0:
                episodes = []  #forget old episodes to avoid wraparound

        if replay_buffer.ready:
            for epoch in range(epochs):
                sample = replay_buffer.sample(Config.batch_size)
                # train critic on discounted future rewards
                yq = (replay_buffer.reward[sample] + Config.gamma *
                      (criticp.predict([
                          replay_buffer.obs1[sample],
                          actorp.predict(replay_buffer.obs1[sample])
                      ])[:, 0]))
                critic.train_on_batch(
                    [replay_buffer.obs[sample], replay_buffer.action[sample]],
                    yq)

                # train the actor to maximize Q
                if i_episode > Config.warmup:
                    actor.train_on_batch(
                        replay_buffer.obs[sample],
                        np.zeros((Config.batch_size, *actor.output_shape[1:])))

                # Soft-update target networks (Polyak): w_target <- tau * w_online + (1 - tau) * w_target
                criticp.set_weights([
                    Config.tau * w + (1 - Config.tau) * wp for wp, w in zip(
                        criticp.get_weights(), critic.get_weights())
                ])
                actorp.set_weights([
                    Config.tau * w + (1 - Config.tau) * wp
                    for wp, w in zip(actorp.get_weights(), actor.get_weights())
                ])
        if flags.clear:
            episodes = []
        episodes.append(episode)
        if len(episode) > 2 and Config.show_progress:
            display_progress(replay_buffer, flags, plt, RewardsHistory, Rdfr,
                             env, episode, episodes, i_episode, actor, actorp,
                             critic, criticp)

        if Config.save_model and i_episode % 100 == 0:
            print("Save models")
            actor.save('actor.h5')
            critic.save('critic.h5')
        print("Episode {} finished after {} timesteps total reward={}".format(
            i_episode, t + 1, RewardsHistory[-1]))
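The exploration object sampled in the loop above is not defined in this snippet; a common choice for DDPG is Ornstein-Uhlenbeck noise, sketched here as an assumption (class name and parameter values are illustrative):

import numpy as np

class OUNoise:
    # Hypothetical Ornstein-Uhlenbeck exploration noise for DDPG.
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(action_dim)
        self.theta = theta
        self.sigma = sigma
        self.state = self.mu.copy()

    def sample(self):
        # One Euler step of dx = theta * (mu - x) + sigma * N(0, I).
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state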
Exemple #14
0
class Trainer_Clipped(object):
    def __init__(self, agent, env):
        self.agent = agent
        self.env = env
        self.seed = random.randint(0, 20180818)
        self.optimizer1 = optim.Adam(agent.parameters1, lr=LEARNING_RATE)
        self.optimizer2 = optim.Adam(agent.parameters2, lr=LEARNING_RATE)
        self.buffer = ReplayBuffer(capacity=CAPACITY)
        self.total_step = 0

    def run(self, device='cpu', buffer=False, explore=False):
        """Run an episode and buffer"""
        self.env.reset()
        self.env.env.seed(self.seed)
        state = self.env.get_screen()
        states = np.asarray([state for _ in range(4)]) # shape (4, 84, 84)
        step = 0
        accumulated_reward = 0
        while True:
            action = self.agent.make_action1(torch.Tensor([states]).to(device), explore=explore)
            state_next, reward, done = self.env.step(action)
            states_next = np.concatenate([states[1:, :, :], [state_next]], axis=0)
            step += 1
            accumulated_reward += reward
            if buffer:
                self.buffer.append(states, action, reward, states_next, done)
            states = states_next
            if not explore:
                # Render the screen to see training
                self.env.env.render()
            if done:
                break
        return accumulated_reward, step

    def _fill_buffer(self, num, device='cpu'):
        start = time.time()
        while self.buffer.size < num:
            self.run(device, buffer=True, explore=True)
            print('Fill buffer: {}/{}'.format(self.buffer.size, self.buffer.capacity))
        print('Filling buffer takes {:.3f} seconds'.format(time.time() - start))

    def train(self, device='cpu'):
        # Temporarily disable recording while filling the replay buffer
        self.env.change_record_every_episode(100000000)
        self._fill_buffer(OBSERV, device)
        if self.env.record_every_episode:
            self.env.change_record_every_episode(self.env.record_every_episode)

        episode = 0
        total_accumulated_rewards = []
        while 'training' != 'converge':
        #while episode <= 500:
            self.env.reset()
            state = self.env.get_screen()
            states = np.asarray([state for _ in range(4)]) # shape (4, 84, 84)
            step_prev = self.total_step
            accumulated_reward = 0
            done = False
            n_flap = 0
            n_none = 0
            while not done:
                #### --------------------
                #### Add a new transition
                # Select an action with the epsilon-greedy policy
                action = self.agent.make_action1(torch.Tensor([states]).to(device), explore=True)

                state_next, reward, done = self.env.step(action)

                states_next = np.concatenate([states[1:, :, :], [state_next]], axis=0)
                self.total_step += 1
                accumulated_reward += reward
                self.buffer.append(states, action, reward, states_next, done)
                states = states_next
                #### --------------------

                #### --------------------
                #### Training step
                start = time.time()
                # prepare training data
                minibatch = self.buffer.sample(n_sample=BATCH)
                _states = [b[0] for b in minibatch]
                _actions = [b[1] for b in minibatch]
                _rewards = [b[2] for b in minibatch]
                _states_next = [b[3] for b in minibatch]
                _dones = [b[4] for b in minibatch]

                ys = []
                for i in range(len(minibatch)):
                    terminal = _dones[i]
                    r = _rewards[i]
                    if terminal:
                        y = r
                    else:
                        # Clipped Double DQN
                        # Open question: should we use e-greedy to pick this action,
                        # or always rely on the argmax of the network itself?
                        s_t_next = torch.Tensor([_states_next[i]]).to(device)
                        # Select the greedy action with self.net1
                        online_act1 = self.agent.make_action1(s_t_next)
                        # Evaluate that action with target network 1
                        max_value1 = self.agent.Q1(s_t_next, online_act1, target=True)
                        # A second action from self.net2 is not needed; both targets
                        # evaluate the same action:
                        # online_act2 = self.agent.make_action2(s_t_next)
                        # Evaluate the same action with target network 2
                        max_value2 = self.agent.Q2(s_t_next, online_act1, target=True)
                        # Index 0 -> network 1, index 1 -> network 2
                        max_values = [max_value1, max_value2]
                        index = np.argmin(np.asarray(max_values))
                        # Clipped target: use the smaller of the two target estimates
                        # to limit overestimation; both networks share this target.
                        y = r + DISCOUNT * max_values[index]
                    ys.append(y)
                ys = torch.Tensor(ys).to(device)

                # Render the screen to see training
                #self.env.env.render()

                # Apply gradient on network 1
                # print('Training network 1...')
                self.optimizer1.zero_grad()
                input = torch.Tensor(_states).to(device)

                output1 = self.agent.net1(input) # shape (BATCH, 2)

                actions_one_hot = np.zeros([BATCH, 2])
                actions_one_hot[np.arange(BATCH), _actions] = 1.0
                actions_one_hot = torch.Tensor(actions_one_hot).to(device)
                ys_hat = (output1 * actions_one_hot).sum(dim=1)
                loss1 = F.smooth_l1_loss(ys_hat, ys)
                loss1.backward()
                self.optimizer1.step()

                # Apply gradient on network 2
                # print('Training network 2...')
                self.optimizer2.zero_grad()
                input = torch.Tensor(_states).to(device)

                output2 = self.agent.net2(input) # shape (BATCH, 2)

                actions_one_hot = np.zeros([BATCH, 2])
                actions_one_hot[np.arange(BATCH), _actions] = 1.0
                actions_one_hot = torch.Tensor(actions_one_hot).to(device)
                ys_hat = (output2 * actions_one_hot).sum(dim=1)
                loss2 = F.smooth_l1_loss(ys_hat, ys)
                loss2.backward()
                self.optimizer2.step()
                #### --------------------

                # logging
                if action == 0:
                    n_flap += 1
                else:
                    n_none += 1

                if done and self.total_step % LOGGING_CYCLE == 0:
                    log = '[{}, {}] alive: {}, reward: {}, F/N: {}/{}, loss1: {:.4f}, loss2: {:.4f}, epsilon: {:.4f}, time: {:.3f}, network: Q{}'.format(
                        episode,
                        self.total_step,
                        self.total_step - step_prev,
                        accumulated_reward,
                        n_flap,
                        n_none,
                        loss1.item(),
                        loss2.item(),
                        self.agent.epsilon,
                        time.time() - start,
                        index+1)
                    print(log)

                self.agent.update_epsilon()
                if self.total_step % TARGET_UPDATE_CYCLE == 0:
                    #print('[Update target network]')
                    self.agent.update_targets()

                if self.total_step % SAVE_MODEL_CYCLE == 0:
                    print('[Save model]')
                    self.save(id=self.total_step)
                    if len(total_accumulated_rewards) > 0:
                        self.save_graph_rewards(episode, total_accumulated_rewards)

            # Keep the accumulated_reward for all the episodes
            total_accumulated_rewards.append(accumulated_reward)
            episode += 1

    def save_graph_rewards(self, episodes, total_accumulated_rewards):
        #fig = plt.figure()
        fig, ax = plt.subplots(figsize=(5, 5))
        plt.xlabel('Episodes')
        plt.ylabel('Total reward')
        episodes_x = np.linspace(0, episodes, episodes)
        ax.plot(episodes_x, np.zeros(episodes), color='red', label='ref')
        ax.plot(episodes_x, total_accumulated_rewards, color='turquoise', label='real')
        ax.legend(loc='lower left')
        if not os.path.exists('tmp/graphs'):
            os.makedirs('tmp/graphs')
        plt.savefig(f'tmp/graphs/Total_rewards_ep={episodes}.png')
        plt.close()


    def save(self, id):
        filename = 'tmp/models1/model_{}.pth.tar'.format(id)
        dirpath = os.path.dirname(filename)
        if not os.path.exists(dirpath):
            os.mkdir(dirpath)
        checkpoint = {
            'net': self.agent.net1.state_dict(),
            'target': self.agent.target1.state_dict(),
            'optimizer': self.optimizer1.state_dict(),
            'total_step': self.total_step
        }
        torch.save(checkpoint, filename)

        # Save Q2 (second network, its target and optimizer)
        filename = 'tmp/models2/model_{}.pth.tar'.format(id)
        dirpath = os.path.dirname(filename)
        if not os.path.exists(dirpath):
            os.mkdir(dirpath)
        checkpoint = {
            'net': self.agent.net2.state_dict(),
            'target': self.agent.target2.state_dict(),
            'optimizer': self.optimizer2.state_dict(),
            'total_step': self.total_step
        }
        torch.save(checkpoint, filename)

    def load(self, filename, device='cpu'):
        # TODO: adjust later for evaluation.
        # Remember: two networks are used, hence two optimizers.
        ckpt = torch.load(filename, map_location=lambda storage, loc: storage)
        ## Deal with missing bn.num_batches_tracked keys
        net_new = OrderedDict()
        tar_new = OrderedDict()

        for k, v in ckpt['net'].items():
            for _k, _v in self.agent.net1.state_dict().items():
                if k == _k:
                    net_new[k] = v

        for k, v in ckpt['target'].items():
            for _k, _v in self.agent.target1.state_dict().items():
                if k == _k:
                    tar_new[k] = v

        self.agent.net1.load_state_dict(net_new)
        self.agent.target1.load_state_dict(tar_new)
        ## -----------------------------------------------

        self.optimizer1.load_state_dict(ckpt['optimizer'])
        self.total_step = ckpt['total_step']
Exemple #15
0
def train(
        args,
        log_dir,
        seed,
        env_id,
        replay_buffer_len,
        memory_len,
        cores,
        trees,
        p,  # number of nearest neighbours; reported number is 50
        embed_size,  # embedding vector length; reported number is ?
        gamma,  # discount value; reported number is 0.99
        N,  # N-step bootstrapping; reported number is 100
        update_period,  # the reported number is 16//4 = 4
        batch_size,  # the reported number is 32
        init_eps,
        delta,
        lr,
        q_lr,
        epsilon,
        min_epsilon,
        epsilon_decay,  # exponential decay factor
        eval_period,
        save_period,
        **kwargs):
    # Additional hyperparameters: discount weights gamma^0 .. gamma^(N-1) for N-step returns
    _gw = np.array([gamma**i for i in range(N)])

    # Experiment setup
    Path(log_dir).mkdir(parents=True, exist_ok='temp' in log_dir)

    with open(os.path.join(log_dir, 'args.txt'), 'w') as f:
        f.write(str(args))

    np.random.seed(seed)
    tf.random.set_random_seed(seed)

    # Env
    env = wrap_deepmind(make_atari(env_id),
                        episode_life=False,
                        clip_rewards=False,
                        frame_stack=True,
                        scale=False)
    num_ac = env.action_space.n

    # ReplayBuffer
    replay_buffer = ReplayBuffer(replay_buffer_len)

    # Neural Episodic Controller
    nec = NEC(
        num_ac,
        p,
        embed_size,
        delta,
        lr,
        q_lr,
        dnd_params={
            'maxlen': memory_len,
            'seed': seed,
            'cores': cores,  # #cores for KD-Tree
            'trees': trees,  # #trees for KD-Tree
        })

    sess = tf.InteractiveSession()
    sess.run(tf.global_variables_initializer())

    summary_writer = tf.summary.FileWriter(os.path.join(
        log_dir, 'tensorboard'))

    def _write_scalar(it, it_type, tag, value):
        summary = tf.Summary(value=[
            tf.Summary.Value(tag=f"{tag}/{it_type}", simple_value=value)
        ])
        summary_writer.add_summary(summary, global_step=it)

    ####### Setup Done

    num_steps = 0
    num_updates = 0

    # Fill up the memory and replay buffer with a random policy
    for ep in range(init_eps):
        ob = env.reset()

        obs, acs, rewards = [ob], [], []
        for _ in itertools.count():
            ac = np.random.randint(num_ac)

            ob, r, done, _ = env.step(ac)

            obs.append(ob)
            acs.append(ac)
            rewards.append(r)

            num_steps += 1

            if done:
                break

        Rs = [
            np.sum(_gw[:len(rewards[i:i + N])] * rewards[i:i + N])
            for i in range(len(rewards))
        ]

        obs = np.array(obs)
        es = nec._embed(obs)

        for ob, e, a, R in zip(obs, es, acs, Rs):
            nec.append(e, a, R)

            replay_buffer.append(ob, a, R)

    # Training!
    next_save_steps = save_period
    try:
        for ep in itertools.count(start=init_eps):
            ob = env.reset()

            obs, acs, rewards, es, Vs = [ob], [], [], [], []
            for t in itertools.count():
                # Epsilon Greedy Policy
                ac, (e, V) = nec.policy(ob)
                if np.random.random() < epsilon:
                    ac = np.random.randint(num_ac)

                ob, r, done, _ = env.step(ac)

                obs.append(ob)
                acs.append(ac)
                rewards.append(r)
                es.append(e)
                Vs.append(V)

                num_steps += 1

                # Train on a random minibatch from the replay buffer
                if num_steps % update_period == 0:
                    b_s, b_a, b_R = replay_buffer.sample(batch_size)
                    loss = nec.update(b_s, b_a, b_R)

                    num_updates += 1

                    if num_updates % 100 == 0:
                        print(f'[{num_steps*4}/{num_updates}] loss: {loss}')

                    _write_scalar(it=num_steps * 4,
                                  it_type='per_frames',
                                  tag='loss',
                                  value=loss)
                    _write_scalar(it=num_updates,
                                  it_type='per_updates',
                                  tag='loss',
                                  value=loss)
                    _write_scalar(it=num_steps * 4,
                                  it_type='per_frames',
                                  tag='num_updates',
                                  value=num_updates)

                if t >= N:
                    # N-Step Bootstrapping
                    # TODO: implement the efficient version
                    R = np.sum(
                        _gw * rewards[t - N:t]) + (gamma**N) * Vs[t]  #R_{t-N}

                    # append to memory
                    nec.append(es[t - N], acs[t - N], R)

                    # append to replay buffer
                    replay_buffer.append(obs[t - N], acs[t - N], R)

                if done:
                    break

            print(
                f'Episode {ep} -- Ep Len: {len(obs)} Acc Reward: {np.sum(rewards)} current epsilon: {epsilon}'
            )
            _write_scalar(tag='ep',
                          value=ep,
                          it=num_steps * 4,
                          it_type='per_frames')
            _write_scalar(tag='ep_len',
                          value=len(obs),
                          it=num_steps * 4,
                          it_type='per_frames')
            _write_scalar(tag='ep_len',
                          value=len(obs),
                          it=ep,
                          it_type='per_episode')
            _write_scalar(tag='eps_reward',
                          value=np.sum(rewards),
                          it=num_steps * 4,
                          it_type='per_frames')
            _write_scalar(tag='eps_reward',
                          value=np.sum(rewards),
                          it=ep,
                          it_type='per_episode')
            _write_scalar(tag='epsilon',
                          value=epsilon,
                          it=ep,
                          it_type='per_episode')

            # Remaining items that cannot be bootstrapped: the partial trajectory near the end of the episode.
            # Append to memory & replay buffer
            for t in range(len(rewards) - N, len(rewards)):
                R = np.sum([
                    gamma**(i - t) * rewards[i]
                    for i in range(t, len(rewards))
                ])
                nec.append(es[t], acs[t], R)
                replay_buffer.append(obs[t], acs[t], R)

            # epsilon decay
            epsilon = max(min_epsilon, epsilon * epsilon_decay)

            # Save model & evaluate
            if ep % eval_period == 0:
                try:
                    ep_len, eps_reward = _run(env,
                                              nec,
                                              os.path.join(
                                                  log_dir, f'test-{ep}.mp4'),
                                              maxlen=len(obs) * 3)

                    print(
                        f'Evaluation -- Episode {ep} -- Ep Len: {ep_len} Acc Reward: {eps_reward}'
                    )
                    _write_scalar(tag='ep_len',
                                  value=ep_len,
                                  it=ep,
                                  it_type='per_episode_eval')
                    _write_scalar(tag='eps_reward',
                                  value=eps_reward,
                                  it=ep,
                                  it_type='per_episode_eval')
                except RuntimeError as e:
                    print(e)
                    print('Evaluation -- Skipped')

            if num_steps >= next_save_steps:
                nec.save(log_dir, it=next_save_steps *
                         4)  # iteration number -- num frames
                next_save_steps += save_period

    except KeyboardInterrupt:
        print('saving... please wait...')
        nec.save(log_dir)
        print('done!')
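The _run evaluation helper called above is not included in this snippet. A minimal greedy-rollout sketch, under the assumption that it records rendered frames to the given mp4 path and returns (episode length, episodic reward); the rendering and video-writing calls are illustrative:

import imageio

def _run(env, nec, video_path, maxlen):
    # Hypothetical greedy evaluation rollout with video recording.
    frames = []
    ob = env.reset()
    eps_reward = 0.0
    for t in range(maxlen):
        frames.append(env.render(mode='rgb_array'))
        ac, _ = nec.policy(ob)  # greedy action from the NEC policy
        ob, r, done, _ = env.step(ac)
        eps_reward += r
        if done:
            break
    imageio.mimsave(video_path, frames, fps=30)
    return t + 1, eps_reward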
Exemple #16
0
max_steps = 500
batch_size = 64
frame_idx = 0
latest_10_returns = deque(maxlen=10)

while True:
    state = env.reset()
    ou_noise.reset()
    episode_reward = 0
    loss_act, loss_cri = 0, 0

    for t in range(max_steps):
        action = act_net.get_action([state])
        action = ou_noise.get_action(action, t)
        next_state, reward, done, _ = env.step(action)
        replay_buffer.append(state, action, reward, next_state, done)
        state = next_state
        frame_idx += 1
        if len(replay_buffer) > batch_size:
            loss_act, loss_cri = ddpg_update(batch_size)
        episode_reward += reward
        if done:
            break
    latest_10_returns.append(episode_reward)
    mean_return = np.mean(latest_10_returns)
    if frame_idx % 500 == 0:
        print(
            'Frame_idx: %d, loss_act: %.3f, loss_cri: %.3f, mean_return: %.3f'
            % (frame_idx, loss_act, loss_cri, float(mean_return)))
    if mean_return > -300:
        torch.save(act_net.state_dict(), identity + '_act.pth')
class Brain:
    """
    The Brain that contains all the models
    """

    def __init__(self, num_states, num_actions, action_high, action_low, gamma=GAMMA, rho=RHO,
                 std_dev=STD_DEV):
        # initialize everything
        self.actor_network = ActorNetwork(num_states, num_actions, action_high)
        self.critic_network = CriticNetwork(num_states, num_actions, action_high)
        self.actor_target = ActorNetwork(num_states, num_actions, action_high)
        self.critic_target = CriticNetwork(num_states, num_actions, action_high)

        # Making the weights equal initially
        self.actor_target.set_weights(self.actor_network.get_weights())
        self.critic_target.set_weights(self.critic_network.get_weights())

        self.buffer = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE)
        self.gamma = tf.constant(gamma)
        self.rho = rho
        self.action_high = action_high
        self.action_low = action_low
        self.num_states = num_states
        self.num_actions = num_actions
        self.noise = OUActionNoise(mean=np.zeros(1), std_deviation=float(std_dev) * np.ones(1))

        # optimizers
        self.critic_optimizer = tf.keras.optimizers.Adam(CRITIC_LR, amsgrad=True)
        self.actor_optimizer = tf.keras.optimizers.Adam(ACTOR_LR, amsgrad=True)

        # temporary variable for side effects
        self.cur_action = None

        # define update weights with tf.function for improved performance
        @tf.function(
            input_signature=[
                tf.TensorSpec(shape=(None, num_states), dtype=tf.float32),
                tf.TensorSpec(shape=(None, num_actions), dtype=tf.float32),
                tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
                tf.TensorSpec(shape=(None, num_states), dtype=tf.float32),
                tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
            ])
        def update_weights(s, a, r, sn, d):
            """
            Function to update weights with optimizer
            """
            with tf.GradientTape() as tape:
                # TD target: y = r + gamma * (1 - d) * Q'(sn, mu'(sn))
                y = r + self.gamma * (1 - d) * self.critic_target([sn, self.actor_target(sn)])
                # Critic loss: mean absolute TD error
                critic_loss = tf.math.reduce_mean(tf.math.abs(y - self.critic_network([s, a])))
            critic_grad = tape.gradient(critic_loss, self.critic_network.trainable_variables)
            self.critic_optimizer.apply_gradients(
                zip(critic_grad, self.critic_network.trainable_variables))

            with tf.GradientTape() as tape:
                # Actor loss: maximize Q(s, mu(s)) by minimizing its negative
                actor_loss = -tf.math.reduce_mean(self.critic_network([s, self.actor_network(s)]))
            actor_grad = tape.gradient(actor_loss, self.actor_network.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_grad, self.actor_network.trainable_variables))
            return critic_loss, actor_loss

        self.update_weights = update_weights

    def act(self, state, _notrandom=True, noise=True):
        """
        Run action by the actor network

        Args:
            state: the current state
            _notrandom: whether greedy is used
            noise: whether noise is to be added to the result action (this improves exploration)

        Returns:
            the resulting action
        """
        # Choose a greedy or random action, then (per the docstring) optionally add
        # exploration noise to the chosen action before clipping it to the bounds.
        if _notrandom:
            self.cur_action = self.actor_network(state)[0].numpy()
        else:
            self.cur_action = np.random.uniform(self.action_low, self.action_high,
                                                self.num_actions)
        if noise:
            self.cur_action = self.cur_action + self.noise()
        self.cur_action = np.clip(self.cur_action, self.action_low, self.action_high)
        maxQ = max(self.critic_network([state, self.actor_network(state)])).numpy()[0]

        return self.cur_action, maxQ

    def remember(self, prev_state, reward, state, done):
        """
        Store states, reward, done value to the buffer
        """
        # store the transition (prev_state, action, reward, state, done) in the buffer
        self.buffer.append(prev_state, self.cur_action, reward, state, done)

    def learn(self, entry):
        """
        Run update for all networks (for training)
        """
        s, a, r, sn, d = zip(*entry)

        c_l, a_l = self.update_weights(tf.convert_to_tensor(s, dtype=tf.float32),
                                       tf.convert_to_tensor(a, dtype=tf.float32),
                                       tf.convert_to_tensor(r, dtype=tf.float32),
                                       tf.convert_to_tensor(sn, dtype=tf.float32),
                                       tf.convert_to_tensor(d, dtype=tf.float32))

        update_target(self.actor_target, self.actor_network, self.rho)
        update_target(self.critic_target, self.critic_network, self.rho)
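        # update_target is not shown in this snippet; a common implementation is a
        # Polyak (soft) update, assuming rho is the fraction of the old target
        # weights that is retained:
        #     def update_target(target, source, rho):
        #         for t_var, s_var in zip(target.trainable_variables,
        #                                 source.trainable_variables):
        #             t_var.assign(rho * t_var + (1.0 - rho) * s_var)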

        return c_l, a_l

    def save_weights(self, path):
        """
        Save weights to `path`
        """
        parent_dir = os.path.dirname(path)
        if not os.path.exists(parent_dir):
            os.makedirs(parent_dir)
        # Save the weights
        self.actor_network.save_weights(path + "an.h5")
        self.critic_network.save_weights(path + "cn.h5")
        self.critic_target.save_weights(path + "ct.h5")
        self.actor_target.save_weights(path + "at.h5")

    def load_weights(self, path):
        """
        Load weights from path
        """
        try:
            self.actor_network.load_weights(path + "an.h5")
            self.critic_network.load_weights(path + "cn.h5")
            self.critic_target.load_weights(path + "ct.h5")
            self.actor_target.load_weights(path + "at.h5")
        except OSError as err:
            logging.warning("Weights files cannot be found, %s", err)