Example 1
    def __init__(self, config, scope='memory_agent', network_builder=None):
        """
        Initialize a vanilla DQN agent as described in
        http://www.nature.com/nature/journal/v518/n7540/full/nature14236.html.

        :param config: Configuration parameters for agent
        :param scope: TensorFlow scope
        """
        self.config = create_config(config, default=self.default_config)
        self.model = None

        self.memory = ReplayMemory(**self.config)
        self.step_count = 0
        self.update_repeat = self.config.update_repeat
        self.batch_size = self.config.batch_size
        self.update_steps = int(round(1 / self.config.update_rate))
        self.use_target_network = self.config.use_target_network

        if self.use_target_network:
            self.target_update_steps = int(
                round(1 / self.config.target_network_update_rate))

        self.min_replay_size = self.config.min_replay_size

        if self.__class__.model:
            self.model = self.__class__.model(self.config,
                                              scope,
                                              network_builder=network_builder)
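
The reciprocal-rate bookkeeping above turns update_rate and target_network_update_rate into step intervals. A minimal sketch of the schedule this implies, with illustrative rate values that are not taken from the original file:

# Illustrative sketch of the update schedule implied by the reciprocal rates
# computed above; the rate values here are assumptions chosen for the example.
update_rate = 0.25
target_network_update_rate = 0.01

update_steps = int(round(1 / update_rate))                        # 4
target_update_steps = int(round(1 / target_network_update_rate))  # 100

for step_count in range(1, 1001):
    if step_count % update_steps == 0:
        pass  # run update_repeat batches of Q-learning updates here
    if step_count % target_update_steps == 0:
        pass  # copy the online network weights to the target network here
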
Example 2
    def test_dqn_agent(self):
        config = {
            'seed': 10,
            'batch_size': 16,
            'state_shape': (2, ),
            'actions': 2,
            'action_shape': (),
            'update_rate': 1,
            'update_repeat': 4,
            'min_replay_size': 50,
            'memory_capacity': 50,
            "exploration": "epsilon_decay",
            "exploration_param": {
                "epsilon": 1,
                "epsilon_final": 0,
                "epsilon_states": 50
            },
            'target_network_update_rate': 1.0,
            'use_target_network': True,
            "alpha": 0.0005,
            "gamma": 0.99,
            "tau": 1.0
        }

        tf.reset_default_graph()
        tf.set_random_seed(10)

        config = create_config(config)
        network_builder = NeuralNetwork.layered_network(
            layers=[{
                'type': 'dense',
                'num_outputs': 16
            }, {
                'type': 'linear',
                'num_outputs': 2
            }])
        agent = DQNAgent(config=config, network_builder=network_builder)

        state = (1, 0)
        rewards = [0.0] * 100
        for n in xrange(10000):
            action = agent.get_action(state=state)
            if action == 0:
                state = (1, 0)
                reward = 0.0
                terminal = False
            else:
                state = (0, 1)
                reward = 1.0
                terminal = False
            agent.add_observation(state=state,
                                  action=action,
                                  reward=reward,
                                  terminal=terminal)
            rewards[n % 100] = reward

            if sum(rewards) == 100.0:
                return

        assert (sum(rewards) == 100.0)
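
The loop above hard-codes a deterministic two-state bandit: action 1 always moves to state (0, 1) and pays 1.0, action 0 pays nothing, and the test passes once the last 100 rewards are all 1.0. A compact restatement of that environment as a hypothetical helper (not part of the original test; the policy-gradient tests below additionally mark the rewarding transition as terminal):

# Hypothetical helper restating the inline environment logic of the test above.
def bandit_step(action):
    # Deterministic two-state "bandit": action 1 always pays 1.0, action 0 pays
    # nothing; in the DQN test no transition is marked terminal.
    if action == 0:
        return (1, 0), 0.0, False
    return (0, 1), 1.0, False
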
Example 3
    def test_vpg_agent(self):
        config = {
            'batch_size': 8,
            'max_episode_length': 4,
            'continuous': False,
            'state_shape': (2,),
            'actions': 2}
        tf.reset_default_graph()

        config = create_config(config)
        network_builder = NeuralNetwork.layered_network(layers=[{'type': 'dense', 'num_outputs': 32}])

        agent = VPGAgent(config=config, network_builder=network_builder)

        state = (1, 0)
        rewards = [0.0] * 100
        for n in range(10000):
            action = agent.get_action(state=state)
            if action == 0:
                state = (1, 0)
                reward = 0.0
                terminal = False
            else:
                state = (0, 1)
                reward = 1.0
                terminal = True
            agent.add_observation(state=state, action=action, reward=reward, terminal=terminal)
            rewards[n % 100] = reward
            if sum(rewards) == 100.0:
                return
        self.assertTrue(False)
Example 4
    def __init__(self, config, scope, task_index, cluster_spec):
        self.config = create_config(config, default=self.default_config)
        self.current_episode = defaultdict(list)
        self.current_episode['terminated'] = False

        self.continuous = self.config.continuous
        self.current_experience = Experience(self.continuous)
        self.model = DistributedModel(config, scope, task_index, cluster_spec)
Example 5
    def __init__(self, config):
        super(MemoryAgentTestModel, self).__init__(config, scope="testmodel")
        self.config = create_config(config, default={})

        self.actions = self.config.actions

        self.count_updates = 0
        self.count_target_updates = 0
Example 6
    def __init__(self, config, scope):
        self.config = create_config(config, default=self.default_config)
        self.model = None
        self.current_batch = []
        self.current_episode = defaultdict(list)
        self.batch_steps = 0
        self.batch_size = self.config.batch_size
        self.last_action = None
        self.last_action_means = None
        self.last_action_log_std = None
        self.continuous = self.config.continuous

        if self.model_ref:
            self.model = self.model_ref(self.config, scope)
Example 7
    def __init__(self, *args, **kwargs):
        """
        Initialize configuration using the default config. Then update the config, first using *args (the order is
        defined in self.config_args) and then using **kwargs.

        :param args: optional *args
        :param kwargs: optional **kwargs
        """
        self.config = create_config([], default=self.default_config)

        for i, arg in enumerate(args):
            if i >= len(self.config_args):
                break
            self.config.update({self.config_args[i]: arg})

        self.config.update(kwargs)
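
A self-contained sketch of the *args-to-config mapping this constructor implements; a plain dict stands in for create_config, and the class name and keys are illustrative assumptions echoing the epsilon_decay parameters used in the tests above:

# Self-contained sketch of the *args-to-config mapping above; a plain dict
# stands in for create_config, and the class and key names are illustrative.
class EpsilonDecayConfig(object):
    config_args = ['epsilon', 'epsilon_final', 'epsilon_states']
    default_config = {'epsilon': 1.0, 'epsilon_final': 0.1, 'epsilon_states': 10000}

    def __init__(self, *args, **kwargs):
        self.config = dict(self.default_config)
        for i, arg in enumerate(args):
            if i >= len(self.config_args):
                break
            self.config[self.config_args[i]] = arg
        self.config.update(kwargs)

# Positional args fill epsilon and epsilon_final in order; the kwarg overrides epsilon_states.
c = EpsilonDecayConfig(1.0, 0.0, epsilon_states=50)
# c.config == {'epsilon': 1.0, 'epsilon_final': 0.0, 'epsilon_states': 50}
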
Example 8
    def test_trpo_agent(self):

        config = {
            'batch_size': 16,
            "override_line_search": False,
            "cg_iterations": 20,
            "use_gae": False,
            "normalize_advantage": False,
            "gae_lambda": 0.97,
            "cg_damping": 0.001,
            "line_search_steps": 20,
            'max_kl_divergence': 0.05,
            'max_episode_length': 4,
            'continuous': False,
            'state_shape': (2,),
            'actions': 2,
            'gamma': 0.99
        }

        config = create_config(config)
        tf.reset_default_graph()

        network_builder = NeuralNetwork.layered_network(layers=[{'type': 'dense',
                                                                 'num_outputs': 8}])
        agent = TRPOAgent(config=config, network_builder=network_builder)

        state = (1, 0)
        rewards = [0.0] * 100

        for n in xrange(10000):
            action = agent.get_action(state=state)
            if action == 0:
                state = (1, 0)
                reward = 0.0
                terminal = False
            else:
                state = (0, 1)
                reward = 1.0
                terminal = True
            agent.add_observation(state=state, action=action, reward=reward, terminal=terminal)
            rewards[n % 100] = reward

            if sum(rewards) == 100.0:
                print('Steps until passed = {:d}'.format(n))

                return
        print('sum = {:f}'.format(sum(rewards)))
Example 9
    def __init__(self, config, scope):
        """
        
        :param config: Configuration parameters
        :param scope: TensorFlow scope
        """
        self.session = tf.Session()
        self.total_states = 0
        self.saver = None
        self.config = create_config(config, default=self.default_config)

        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(log_levels[config.get('loglevel', 'info')])

        # This is the scope used to prefix variable creation for distributed TensorFlow
        self.scope = scope

        self.deterministic_mode = config.get('deterministic_mode', False)
        self.episode_length = tf.placeholder(tf.int32, (None, ),
                                             name='episode_length')

        self.learning_rate = config.get('learning_rate', 0.001)

        if self.config.seed is not None:
            self.random = global_seed(self.config.seed)
            tf.set_random_seed(self.config.seed)
        else:
            self.random = np.random.RandomState()

        optimizer = config.get('optimizer')
        if not optimizer:
            self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
        else:
            args = config.get('optimizer_args', [])
            kwargs = config.get('optimizer_kwargs', {})
            optimizer_cls = get_function(optimizer)
            self.optimizer = optimizer_cls(self.learning_rate, *args, **kwargs)

        exploration = config.get('exploration')
        if not exploration:
            self.exploration = exploration_mode['constant'](self, 0)
        else:
            args = config.get('exploration_args', [])
            kwargs = config.get('exploration_kwargs', {})
            self.exploration = exploration_mode[exploration](self, *args,
                                                             **kwargs)
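
A hypothetical config fragment showing the keys this constructor reads when resolving the optimizer and exploration strategy. The dotted optimizer path follows the dotted-path convention used elsewhere in these examples, and the exploration kwarg names echo the epsilon_decay parameters from the tests; all values are assumptions rather than defaults from the library:

# Hypothetical config fragment; the keys mirror the config.get(...) calls above,
# while the dotted optimizer path and all values are illustrative assumptions.
model_config = {
    'learning_rate': 0.0005,
    'optimizer': 'tensorflow.train.RMSPropOptimizer',  # resolved via get_function()
    'optimizer_kwargs': {'decay': 0.95},                # forwarded after learning_rate
    'exploration': 'epsilon_decay',                     # key into exploration_mode
    'exploration_kwargs': {'epsilon': 1.0, 'epsilon_final': 0.1, 'epsilon_states': 10000},
}
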
Example 10
    def __init__(self, config, scope='pg_agent', network_builder=None):
        self.config = create_config(config, default=self.default_config)
        assert issubclass(self.__class__.model, PGModel)
        self.model = self.__class__.model(self.config,
                                          scope,
                                          network_builder=network_builder)
        self.continuous = self.config.continuous

        self.batch_size = self.config.batch_size
        self.max_episode_length = min(self.config.max_episode_length,
                                      self.batch_size)
        self.current_batch = []
        self.batch_step = 0

        self.current_episode = self.model.zero_episode()
        self.episode_step = 0

        self.last_action = None
        self.last_action_means = None
        self.last_action_log_std = None
Example 11
    def test_trpo_agent(self):
        config = {
            'batch_size': 8,
            "cg_iterations": 20,
            "cg_damping": 0.001,
            "line_search_steps": 20,
            'max_kl_divergence': 0.01,
            'max_episode_length': 4,
            'continuous': False,
            'state_shape': (2, ),
            'actions': 2
        }
        tf.reset_default_graph()

        config = create_config(config)
        network_builder = NeuralNetwork.layered_network(
            layers=[{
                'type': 'dense',
                'num_outputs': 32
            }])
        agent = TRPOAgent(config=config, network_builder=network_builder)

        state = (1, 0)
        rewards = [0.0] * 100
        for n in range(100):
            action = agent.get_action(state=state)
            if action == 0:
                state = (1, 0)
                reward = 0.0
                terminal = False
            else:
                state = (0, 1)
                reward = 1.0
                terminal = True
            agent.add_observation(state=state,
                                  action=action,
                                  reward=reward,
                                  terminal=terminal)
            rewards[n % 100] = reward
            if sum(rewards) == 100.0:
                return
Example 12
    def __init__(self, config, scope='dqfd_agent', network_builder=None):
        """
        
        :param config: 
        :param scope: 
        """
        self.config = create_config(config, default=self.default_config)

        # This is the online memory
        self.replay_memory = ReplayMemory(**self.config)

        # This is the demonstration memory that we will fill with observations before starting
        # the main training loop
        # TODO we might want different sizes for these memories -> add config param
        self.demo_memory = ReplayMemory(**self.config)

        self.step_count = 0

        # Called p in paper, controls ratio of expert vs online training samples
        self.expert_sampling_ratio = self.config.expert_sampling_ratio

        self.update_repeat = self.config.update_repeat
        self.batch_size = self.config.batch_size

        # p = n_demo / (n_demo + n_replay) => n_demo  = p * n_replay / (1 - p)
        self.demo_batch_size = int(self.expert_sampling_ratio * self.batch_size /
                                   (1.0 - self.expert_sampling_ratio))
        self.update_steps = int(round(1 / self.config.update_rate))
        self.use_target_network = self.config.use_target_network

        if self.use_target_network:
            self.target_update_steps = int(
                round(1 / self.config.target_network_update_rate))

        self.min_replay_size = self.config.min_replay_size

        if self.__class__.model:
            self.model = self.__class__.model(self.config,
                                              scope,
                                              network_builder=network_builder)
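
The expert-sampling comment above fixes the ratio p = n_demo / (n_demo + n_replay), which rearranges to n_demo = p * n_replay / (1 - p). A worked example with illustrative numbers (not the values used in the test below):

# Worked example of the demo batch size formula; the numbers are illustrative,
# not taken from the tests in this listing.
expert_sampling_ratio = 0.2   # p
batch_size = 16               # n_replay

demo_batch_size = int(expert_sampling_ratio * batch_size / (1.0 - expert_sampling_ratio))
# 0.2 * 16 / 0.8 = 4, so each update combines 16 online samples with 4 demo
# samples, and 4 / (16 + 4) = 0.2 recovers the requested ratio p.
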
Example 13
    def __init__(self, config, scope):
        """

        :param config: Configuration parameters
        :param scope: TensorFlow scope
        """

        # TODO move several default params up here
        self.session = tf.Session()
        self.total_states = 0
        self.saver = None
        self.config = create_config(config, default=self.default_config)

        # This is the scope used to prefix variable creation for distributed TensorFlow
        self.scope = scope
        self.batch_shape = [None]

        self.deterministic_mode = config.get('deterministic_mode', False)

        self.alpha = config.get('alpha', 0.001)

        optimizer = config.get('optimizer')
        if not optimizer:
            self.optimizer = tf.train.AdamOptimizer(self.alpha)
        else:
            args = config.get('optimizer_args', [])
            kwargs = config.get('optimizer_kwargs', {})
            optimizer_cls = get_function(optimizer)
            self.optimizer = optimizer_cls(self.alpha, *args, **kwargs)

        exploration = config.get('exploration')
        if not exploration:
            self.exploration = exploration_mode['constant'](self, 0)
        else:
            args = config.get('exploration_args', [])
            kwargs = config.get('exploration_kwargs', {})
            self.exploration = exploration_mode[exploration](self, *args,
                                                             **kwargs)
Example 14
    def __init__(self,
                 config,
                 scope,
                 task_index,
                 cluster_spec,
                 define_network=None):
        """

        A distributed agent must synchronise local and global parameters under different
        scopes.

        :param config: Configuration parameters
        :param scope: TensorFlow scope
        """

        self.session = None
        self.saver = None
        self.config = create_config(config, default=self.default_config)
        self.scope = scope
        self.task_index = task_index
        self.batch_size = self.config.batch_size
        self.action_count = self.config.actions
        self.use_gae = self.config.use_gae
        self.gae_lambda = self.config.gae_lambda

        self.gamma = self.config.gamma
        self.continuous = self.config.continuous
        self.normalize_advantage = self.config.normalise_advantage

        if self.config.deterministic_mode:
            self.random = global_seed()
        else:
            self.random = np.random.RandomState()

        if define_network is None:
            self.define_network = NeuralNetwork.layered_network(
                self.config.network_layers)
        else:
            self.define_network = define_network

        # This is the scope used to prefix variable creation for distributed TensorFlow
        self.batch_shape = [None]
        self.deterministic_mode = config.get('deterministic_mode', False)
        self.alpha = config.get('alpha', 0.001)
        self.optimizer = None

        self.worker_device = "/job:worker/task:{}/cpu:0".format(task_index)

        with tf.device(
                tf.train.replica_device_setter(
                    1, worker_device=self.worker_device,
                    cluster=cluster_spec)):
            with tf.variable_scope("global"):
                self.global_state = tf.placeholder(
                    tf.float32,
                    self.batch_shape + list(self.config.state_shape),
                    name="global_state")

                self.global_network = NeuralNetwork(self.define_network,
                                                    [self.global_state])
                self.global_step = tf.get_variable(
                    "global_step", [],
                    tf.int32,
                    initializer=tf.constant_initializer(0, dtype=tf.int32),
                    trainable=False)

                self.global_prev_action_means = tf.placeholder(
                    tf.float32, [None, self.action_count], name='prev_actions')

                if self.continuous:
                    self.global_policy = GaussianPolicy(
                        self.global_network, self.session, self.global_state,
                        self.random, self.action_count, 'gaussian_policy')
                    self.global_prev_action_log_stds = tf.placeholder(
                        tf.float32, [None, self.action_count])

                    self.global_prev_dist = dict(
                        policy_output=self.global_prev_action_means,
                        policy_log_std=self.global_prev_action_log_stds)

                else:
                    self.global_policy = CategoricalOneHotPolicy(
                        self.global_network, self.session, self.global_state,
                        self.random, self.action_count, 'categorical_policy')
                    self.global_prev_dist = dict(
                        policy_output=self.global_prev_action_means)

                # Probability distribution used in the current policy
                self.global_baseline_value_function = LinearValueFunction()

            # self.optimizer = config.get('optimizer')
            # self.optimizer_args = config.get('optimizer_args', [])
            # self.optimizer_kwargs = config.get('optimizer_kwargs', {})

        exploration = config.get('exploration')
        if not exploration:
            self.exploration = exploration_mode['constant'](self, 0)
        else:
            args = config.get('exploration_args', [])
            kwargs = config.get('exploration_kwargs', {})
            self.exploration = exploration_mode[exploration](self, *args,
                                                             **kwargs)

        self.create_training_operations()
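
A minimal sketch of the cluster_spec and task_index arguments this distributed constructor expects; the job layout and addresses are placeholder assumptions:

import tensorflow as tf

# Hypothetical one-parameter-server, two-worker cluster; the addresses are placeholders.
cluster_spec = tf.train.ClusterSpec({
    'ps': ['localhost:2222'],
    'worker': ['localhost:2223', 'localhost:2224'],
})

# task_index selects which worker this process acts as and determines the
# worker_device string built in the constructor above.
task_index = 0
worker_device = "/job:worker/task:{}/cpu:0".format(task_index)
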
Example 15
    def test_dqfd_agent(self):

        config = {
            "expert_sampling_ratio": 0.01,
            "supervised_weight": 0.5,
            "expert_margin": 1,
            'batch_size': 8,
            'state_shape': (2, ),
            'actions': 2,
            'action_shape': (),
            'update_rate': 1,
            'update_repeat': 4,
            'min_replay_size': 20,
            'memory_capacity': 20,
            "exploration": "epsilon_decay",
            "exploration_param": {
                "epsilon": 0,
                "epsilon_final": 0,
                "epsilon_states": 0
            },
            'target_network_update_rate': 1.0,
            'use_target_network': True,
            "alpha": 0.00004,
            "gamma": 1,
            "tau": 1.0
        }

        tf.reset_default_graph()

        config = create_config(config)
        network_builder = NeuralNetwork.layered_network(
            layers=[{'type': 'dense',
                     'num_outputs': 16,
                     'weights_regularizer': 'tensorflow.contrib.layers.python.layers.regularizers.l2_regularizer',
                     'weights_regularizer_kwargs': {'scale': 0.01}},
                    {'type': 'linear', 'num_outputs': 2}])
        agent = DQFDAgent(config=config, network_builder=network_builder)

        state = (1, 0)
        rewards = [0.0] * 100

        # First: add to demo memory
        for n in xrange(50):
            action = agent.get_action(state=state)
            if action == 0:
                state = (1, 0)
                reward = 0.0
                terminal = False
            else:
                state = (0, 1)
                reward = 1.0
                terminal = False
            agent.add_demo_observation(state=state,
                                       action=action,
                                       reward=reward,
                                       terminal=terminal)

        # Pre-train from demo data
        agent.pre_train(10000)

        # If pretraining worked, we should not need much more training
        for n in xrange(1000):
            action = agent.get_action(state=state)
            if action == 0:
                state = (1, 0)
                reward = 0.0
                terminal = False
            else:
                state = (0, 1)
                reward = 1.0
                terminal = False

            agent.add_observation(state=state,
                                  action=action,
                                  reward=reward,
                                  terminal=terminal)
            rewards[n % 100] = reward

            if sum(rewards) == 100.0:
                print('Passed after steps = {:d}'.format(n))

                return
        print('sum = {:f}'.format(sum(rewards)))