Example #1
class CMPRunner(object):
    def __init__(self, model_path):
        config = self._get_tf_config()
        args = get_cfg_defaults()

        self.tf_graph = tf.Graph()

        with self.tf_graph.as_default():
            # build the CMP policy graph in inference mode (single-step unrolling)
            self.policy = CMPPolicy(args, is_training=False, batch_norm_is_training=False,
                                    only_single_step_graph=True)

            # create the session, initialize variables, then restore the checkpoint
            self.sess = tf.Session(config=config)
            self.sess.run(self.policy.init_op)
            self.policy.saver_op.restore(self.sess, model_path)

    def set_new_goal(self, goal):
        # reset the grid environment with the new goal and clear per-episode caches
        goal_raw = np.array(goal)
        self.grid_env = GridEnv()
        _ = self.grid_env.reset(goal_raw)
        self.state_features, self.step_data_cache = [], []

        # fetch the policy's initial recurrent state and key it by state name
        feed_dict = {}
        self.net_state = self.sess.run(self.policy.train_ops['init_state'],
                                       feed_dict=feed_dict)
        self.net_state = dict(
            zip(self.policy.train_ops['state_names'], self.net_state))

    def compute_action(self, image):
        # assemble step features: environment features + current image + recurrent state
        f = self.grid_env.get_features()
        f['imgs'] = image.copy()
        f = self.grid_env.pre_features(f)
        f.update(self.net_state)

        self.state_features.append(f)
        feed_dict = prepare_feed_dict(self.policy.input_tensors['step'],
                                      self.state_features[-1])
        feed_dict[self.policy.train_ops['batch_norm_is_training_op']] = False

        # run one policy step: action probabilities, cached step data, updated state
        kwargs = {}
        outs = self.sess.run([self.policy.train_ops['step'],
                              self.policy.train_ops['step_data_cache'],
                              self.policy.train_ops['updated_state']],
                             feed_dict=feed_dict, **kwargs)
        action_probs = np.expand_dims(outs[0], axis=1)
        action = [np.argmax(action_probs[0, 0, :])]  # greedy action
        self.step_data_cache.append(
            dict(zip(self.policy.train_ops['step_data_cache_names'], outs[1])))
        self.net_state = outs[2]

        # apply the action in the grid environment and re-key the recurrent state by name
        assert (action_probs.shape[1] == 1)
        next_state = self.grid_env.take_action(action[0])
        self.net_state = dict(
            zip(self.policy.train_ops['state_names'], self.net_state))
        return action, next_state

    def _get_tf_config(self):
        config = tf.ConfigProto()
        config.device_count['GPU'] = 1  # use at most one GPU
        config.gpu_options.allow_growth = True  # allocate GPU memory on demand
        config.intra_op_parallelism_threads = 0  # 0 lets TF choose the thread counts
        config.inter_op_parallelism_threads = 0
        return config
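
A minimal usage sketch for CMPRunner, assuming a trained checkpoint exists on disk; the checkpoint path, goal coordinates, and the dummy image (including its shape) are illustrative placeholders rather than values from the original example:

import numpy as np

if __name__ == "__main__":
    # Hypothetical driver: replace the path, goal, and image source with real ones.
    runner = CMPRunner(model_path="/path/to/cmp_checkpoint")
    runner.set_new_goal(goal=[3, 4])

    image = np.zeros((224, 224, 3), dtype=np.uint8)  # stand-in for a camera frame
    for _ in range(10):
        action, next_state = runner.compute_action(image)
        print(action, next_state)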
Example #2
import os
import numpy as np

from grid_env import GridEnv
from agent import Agent

NUM_EPISODE = 1000

if __name__ == "__main__":
    env = GridEnv()
    agent = Agent(env)

    for n_episode in range(NUM_EPISODE):
        state = env.reset()

        while True:
            action = agent.get_action(state)
            next_state, reward, done = env.step(action)
            next_action = agent.get_action(next_state)

            # SARSA-style update: the action selected in the next state is passed in
            agent.update_table(state, action, reward, next_state, next_action)
            state = next_state

            if done:
                break

        # build horizontal separator lines for a per-episode debug printout of the grid
        debug_str = ""
        for h in range(env.height):
            for w in range(env.width):
                debug_str += '****************'
            debug_str += "*\n"
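
The loop above only relies on the agent exposing get_action and update_table; the Agent class imported from agent is not shown. Below is a minimal tabular SARSA sketch compatible with that interface, assuming integer state indices and known state/action counts (TabularSarsaAgent is a hypothetical stand-in, not the repository's Agent):

import numpy as np

class TabularSarsaAgent:
    def __init__(self, n_states, n_actions, alpha=0.1, gamma=0.99, eps=0.1):
        self.q = np.zeros((n_states, n_actions))
        self.n_actions = n_actions
        self.alpha, self.gamma, self.eps = alpha, gamma, eps

    def get_action(self, state):
        # epsilon-greedy over the current Q estimates
        if np.random.rand() < self.eps:
            return np.random.randint(self.n_actions)
        return int(np.argmax(self.q[state]))

    def update_table(self, state, action, reward, next_state, next_action):
        # on-policy (SARSA) target: bootstrap from the action actually chosen next
        target = reward + self.gamma * self.q[next_state, next_action]
        self.q[state, action] += self.alpha * (target - self.q[state, action])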
Example #3
from __future__ import division

import tensorflow as tf
import numpy as np 
import gym
import itertools
import collections
from grid_env import GridEnv
from lib import plotting
from matplotlib import pyplot as plt


env = GridEnv()
obs_dim, act_dim, grid_size = env.get_dimensions()


# Coordinated Exploration
class MultiAgent(): 

    def __init__(self, learning_rate=0.01): 

        self.reg_constant_1 = 0.0005  # 0.0005 for hard env, 0.0002 for easy
        self.reg_constant_2 = 0.0005
        with tf.variable_scope('agent1'): 

            # Define policy for agent 1
            self.lambda_1 = tf.placeholder(tf.float32, name='lambda_1')
            self.state_1 = tf.placeholder(tf.int32, [], "state_1")
            self.action_1 = tf.placeholder(dtype=tf.int32, name="action_1")
            self.target_1 = tf.placeholder(dtype=tf.float32, name="target_1")
            self.ratio_1 = tf.placeholder(dtype=tf.float32, name="ratio_1")
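
The snippet ends after declaring the agent-1 placeholders. For orientation, the sketch below shows one common TF1 pattern for wiring such placeholders into a tabular softmax policy head; it is an assumed continuation, not the original repository's code, and build_softmax_policy_head is a hypothetical helper:

import tensorflow as tf

def build_softmax_policy_head(state_ph, action_ph, target_ph,
                              n_states, n_actions, learning_rate=0.01):
    # one-hot encode the scalar integer state and map it to action logits
    state_one_hot = tf.one_hot(state_ph, n_states)
    logits = tf.layers.dense(tf.expand_dims(state_one_hot, 0), n_actions,
                             activation=None)
    action_probs = tf.squeeze(tf.nn.softmax(logits))
    # REINFORCE-style loss: negative log-likelihood of the taken action,
    # weighted by the supplied target (e.g. a return or advantage estimate)
    picked_prob = tf.gather(action_probs, action_ph)
    loss = -tf.log(picked_prob) * target_ph
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    return action_probs, loss, train_op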
Example #4
class DQN_agent():
    def __init__(self):
        self.eps = 0.1
        self.env = GridEnv(3)
        self.batch_size = 20

        if prioritized_replay and replay_type == "proportional":
            self.replay = ProportionalReplay(max_buffer_size,
                                             prioritized_replay_alpha)
        elif prioritized_replay and replay_type == "ranked":
            N_list = [self.batch_size] + [
                int(x) for x in np.linspace(100, max_buffer_size, 5)
            ]
            save_quantiles(N_list=N_list,
                           k=self.batch_size,
                           alpha=prioritized_replay_alpha)
            self.replay = RankBasedReplay(max_buffer_size,
                                          prioritized_replay_alpha)
        else:
            self.replay = ExperienceReplay(
                max_buffer_size)  # passing size of buffer

        # define graph
        self.inputs = tf.placeholder(tf.float32,
                                     shape=(None, self.env.state_size))
        self.target_values = tf.placeholder(tf.float32, shape=(None, ))
        self.actions = tf.placeholder(tf.int32, shape=(None, ))
        self.is_weights = tf.placeholder(tf.float32, shape=(
            None, ))  # importance sampling weights for prioritized replay
        self.Q_out_op, self.Q_update_op, self.td_error_op = self.build_graph(
        )  # build main network
        self.target_Q_out_op, _, _ = self.build_graph(
            'target')  # build identical target network

        self.init_op = tf.global_variables_initializer()
        self.sess = tf.Session()

    def build_graph(self, scope='main'):
        with tf.variable_scope(scope):
            h = tf.layers.dense(self.inputs,
                                16,
                                activation=tf.nn.relu,
                                name="h")
            outputs = tf.layers.dense(h,
                                      self.env.num_actions,
                                      activation=tf.nn.softmax,
                                      name="outputs")

            # everything is now the same shape (batch_size, num_actions)
            # nonzero error only for selected actions
            action_mask = tf.one_hot(self.actions,
                                     self.env.num_actions,
                                     on_value=True,
                                     off_value=False)
            targets = tf.tile(tf.expand_dims(self.target_values, 1),
                              [1, self.env.num_actions])
            target_outputs = tf.where(
                action_mask, targets, outputs
            )  # takes target value where mask is true. takes outputs value otherwise

            td_error = target_outputs - outputs  # only one element in each row is non-zero
            weights = tf.tile(tf.expand_dims(self.is_weights, 1),
                              [1, self.env.num_actions
                               ])  # all 1s when not using priority replay
            weighted_td_error = weights * td_error  # element-wise multiplication

            loss = tf.reduce_sum(tf.square(weighted_td_error))
            update = tf.train.AdamOptimizer().minimize(loss)
        return outputs, update, td_error

    def train(self):
        steps_per_ep = np.zeros(episodes)
        for episode in range(episodes):
            print(episode)
            self.env.reset()
            state = self.env.state
            done = False
            num_steps = 0
            while not done:
                num_steps += 1
                action = self.get_eps_action(state, self.eps)
                next_state, reward, done, _ = self.env.step(action)
                self.replay.add((state, action, reward, next_state,
                                 done))  # store in experience replay

                # sample from experience replay
                if prioritized_replay:
                    beta = beta0 + episode * (
                        1 - beta0
                    ) / episodes  # linear annealing schedule for IS weights
                    states, actions, rewards, next_states, dones, weights, indices = self.replay.sample(
                        self.batch_size, beta)
                    self.net_update(states, actions, rewards, next_states,
                                    dones, weights, indices)  # qlearning
                else:
                    states, actions, rewards, next_states, dones = self.replay.sample(
                        self.batch_size)
                    self.net_update(states, actions, rewards, next_states,
                                    dones)  # qlearning

                # slowly update target network
                if num_steps % update_every == 0:
                    self.target_net_update()

                # sort max heap periodically
                if num_steps % sort_every == 0:
                    if prioritized_replay and replay_type == "ranked":
                        self.replay.sort()

                state = next_state
            steps_per_ep[episode] = num_steps
        return steps_per_ep

    # from https://tomaxent.com/2017/07/09/Using-Tensorflow-and-Deep-Q-Network-Double-DQN-to-Play-Breakout/
    def target_net_update(self):
        # get sorted lists of parameters in each of the networks
        main_params = [
            t for t in tf.trainable_variables() if t.name.startswith("main")
        ]
        main_params = sorted(main_params, key=lambda v: v.name)
        target_params = [
            t for t in tf.trainable_variables() if t.name.startswith("target")
        ]
        target_params = sorted(target_params, key=lambda v: v.name)

        update_ops = []
        for main_v, target_v in zip(main_params, target_params):
            op = target_v.assign(main_v)
            update_ops.append(op)

        self.sess.run(update_ops)

    # minibatch qlearning
    def net_update(self,
                   states,
                   actions,
                   rewards,
                   next_states,
                   dones,
                   weights=None,
                   indices=None):
        not_dones = np.logical_not(dones)

        # create a shape (batch_size, ) array of target values
        target_values = rewards.astype(
            float)  # np.array of shape (batch_size, )
        next_inputs = next_states[
            not_dones]  # np.array of shape (#done, state_size)
        next_Qs = self.sess.run(self.Q_out_op,
                                {self.inputs: next_inputs
                                 })  # np.array of shape (#done, num_actions)
        max_Qs = np.max(next_Qs, axis=1)  # np.array of shape (#done,)
        target_values[not_dones] += gamma * max_Qs

        # if not using prioritized replay
        if weights is None:
            weights = np.ones(self.batch_size)

        # compute gradients and update parameters
        _, td_error = self.sess.run(
            [self.Q_update_op, self.td_error_op], {
                self.inputs: states,
                self.target_values: target_values,
                self.actions: actions,
                self.is_weights: weights
            })

        # update priority replay priorities
        if indices is not None:
            td_error = td_error.ravel()[np.flatnonzero(
                td_error)]  # shape (batch_size, )
            self.replay.update_priorities(
                indices,
                np.abs(td_error) + 1e-3
            )  # add small number to prevent never sampling 0 error transitions

    # returns eps-greedy action with respect to Q
    def get_eps_action(self, state, eps):
        if self.env.np_random.uniform() < eps:
            action = self.env.sample()
        else:
            Q = self.sess.run(self.Q_out_op, {self.inputs: np.array([state])})
            max_actions = np.where(np.ravel(Q) == Q.max())[0]
            action = self.env.np_random.choice(
                max_actions)  # to select argmax randomly
        return action
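
DQN_agent reads several module-level names (prioritized_replay, replay_type, max_buffer_size, prioritized_replay_alpha, beta0, gamma, episodes, update_every, sort_every) that the snippet does not define. Below is a minimal driver sketch; the hyperparameter values are illustrative only, not the original script's settings:

# Illustrative hyperparameter values; the original file defines these
# at module level with its own settings.
prioritized_replay = True
replay_type = "proportional"
max_buffer_size = 10000
prioritized_replay_alpha = 0.6
beta0 = 0.4
gamma = 0.99
episodes = 500
update_every = 100
sort_every = 250

if __name__ == "__main__":
    agent = DQN_agent()
    agent.sess.run(agent.init_op)  # initialize main and target networks
    steps_per_ep = agent.train()
    print("mean steps per episode:", steps_per_ep.mean())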