def __init__(self, config, scope='memory_agent', network_builder=None):
    """
    Initialize a vanilla DQN agent as described in
    http://www.nature.com/nature/journal/v518/n7540/full/nature14236.html.

    :param config: Configuration parameters for agent
    :param scope: TensorFlow scope
    :param network_builder: Optional function building the TensorFlow network
    """
    self.config = create_config(config, default=self.default_config)
    self.model = None
    self.memory = ReplayMemory(**self.config)
    self.step_count = 0
    self.update_repeat = self.config.update_repeat
    self.batch_size = self.config.batch_size

    # Rates are given as reciprocals: an update happens every 1/rate steps.
    self.update_steps = int(round(1 / self.config.update_rate))
    self.use_target_network = self.config.use_target_network

    if self.use_target_network:
        self.target_update_steps = int(round(1 / self.config.target_network_update_rate))

    self.min_replay_size = self.config.min_replay_size

    if self.__class__.model:
        self.model = self.__class__.model(self.config, scope, network_builder=network_builder)
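# A hedged worked example of the reciprocal-rate convention above
# (the rate values are hypothetical, not repo defaults):
update_rate = 0.25
target_network_update_rate = 0.01
update_steps = int(round(1 / update_rate))                        # -> 4: train every 4th step
target_update_steps = int(round(1 / target_network_update_rate))  # -> 100: sync target net every 100 steps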
def test_dqn_agent(self):
    config = {
        'seed': 10,
        'batch_size': 16,
        'state_shape': (2,),
        'actions': 2,
        'action_shape': (),
        'update_rate': 1,
        'update_repeat': 4,
        'min_replay_size': 50,
        'memory_capacity': 50,
        'exploration': 'epsilon_decay',
        'exploration_param': {
            'epsilon': 1,
            'epsilon_final': 0,
            'epsilon_states': 50
        },
        'target_network_update_rate': 1.0,
        'use_target_network': True,
        'alpha': 0.0005,
        'gamma': 0.99,
        'tau': 1.0
    }

    tf.reset_default_graph()
    tf.set_random_seed(10)

    config = create_config(config)
    network_builder = NeuralNetwork.layered_network(layers=[
        {'type': 'dense', 'num_outputs': 16},
        {'type': 'linear', 'num_outputs': 2}
    ])
    agent = DQNAgent(config=config, network_builder=network_builder)

    # Simple deterministic two-state environment: action 1 always yields reward 1.
    state = (1, 0)
    rewards = [0.0] * 100

    for n in range(10000):
        action = agent.get_action(state=state)
        if action == 0:
            state = (1, 0)
            reward = 0.0
        else:
            state = (0, 1)
            reward = 1.0
        agent.add_observation(state=state, action=action, reward=reward, terminal=False)

        rewards[n % 100] = reward
        if sum(rewards) == 100.0:
            return

    self.fail('DQN agent did not learn the optimal action within 10000 steps.')
def test_vpg_agent(self):
    config = {
        'batch_size': 8,
        'max_episode_length': 4,
        'continuous': False,
        'state_shape': (2,),
        'actions': 2
    }

    tf.reset_default_graph()

    config = create_config(config)
    network_builder = NeuralNetwork.layered_network(layers=[{'type': 'dense', 'num_outputs': 32}])
    agent = VPGAgent(config=config, network_builder=network_builder)

    state = (1, 0)
    rewards = [0.0] * 100

    for n in range(10000):
        action = agent.get_action(state=state)
        if action == 0:
            state = (1, 0)
            reward = 0.0
            terminal = False
        else:
            state = (0, 1)
            reward = 1.0
            terminal = True
        agent.add_observation(state=state, action=action, reward=reward, terminal=terminal)

        rewards[n % 100] = reward
        if sum(rewards) == 100.0:
            return

    self.fail('VPG agent did not learn the optimal action within 10000 steps.')
def __init__(self, config, scope, task_index, cluster_spec):
    self.config = create_config(config, default=self.default_config)
    self.current_episode = defaultdict(list)
    self.current_episode['terminated'] = False
    self.continuous = self.config.continuous
    self.current_experience = Experience(self.continuous)
    self.model = DistributedModel(config, scope, task_index, cluster_spec)
def __init__(self, config):
    super(MemoryAgentTestModel, self).__init__(config, scope='testmodel')
    self.config = create_config(config, default={})
    self.actions = self.config.actions
    self.count_updates = 0
    self.count_target_updates = 0
def __init__(self, config, scope):
    self.config = create_config(config, default=self.default_config)
    self.model = None
    self.current_batch = []
    self.current_episode = defaultdict(list)
    self.batch_steps = 0
    self.batch_size = self.config.batch_size
    self.last_action = None
    self.last_action_means = None
    self.last_action_log_std = None
    self.continuous = self.config.continuous

    if self.model_ref:
        self.model = self.model_ref(self.config, scope)
def __init__(self, *args, **kwargs):
    """
    Initialize configuration using the default config, then update it first
    from *args (in the order defined by self.config_args) and then from **kwargs.

    :param args: optional positional arguments, mapped onto self.config_args
    :param kwargs: optional keyword arguments, applied as direct config updates
    """
    self.config = create_config([], default=self.default_config)
    for i, arg in enumerate(args):
        if i >= len(self.config_args):
            break
        self.config.update({self.config_args[i]: arg})
    self.config.update(kwargs)
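# A hedged, self-contained sketch of the *args-to-config mapping above
# (the class and its config_args values are hypothetical, not from the repo):
class _ArgsDemo(object):
    config_args = ['alpha', 'gamma']  # order positional arguments map onto

    def __init__(self, *args, **kwargs):
        self.config = {}
        for i, arg in enumerate(args):
            if i >= len(self.config_args):
                break  # extra positional arguments are silently ignored
            self.config[self.config_args[i]] = arg
        self.config.update(kwargs)

# _ArgsDemo(0.001, 0.99, batch_size=16).config
# -> {'alpha': 0.001, 'gamma': 0.99, 'batch_size': 16}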
def test_trpo_agent(self):
    config = {
        'batch_size': 16,
        'override_line_search': False,
        'cg_iterations': 20,
        'use_gae': False,
        'normalize_advantage': False,
        'gae_lambda': 0.97,
        'cg_damping': 0.001,
        'line_search_steps': 20,
        'max_kl_divergence': 0.05,
        'max_episode_length': 4,
        'continuous': False,
        'state_shape': (2,),
        'actions': 2,
        'gamma': 0.99
    }

    config = create_config(config)
    tf.reset_default_graph()

    network_builder = NeuralNetwork.layered_network(layers=[{'type': 'dense', 'num_outputs': 8}])
    agent = TRPOAgent(config=config, network_builder=network_builder)

    state = (1, 0)
    rewards = [0.0] * 100

    for n in range(10000):
        action = agent.get_action(state=state)
        if action == 0:
            state = (1, 0)
            reward = 0.0
            terminal = False
        else:
            state = (0, 1)
            reward = 1.0
            terminal = True
        agent.add_observation(state=state, action=action, reward=reward, terminal=terminal)

        rewards[n % 100] = reward
        if sum(rewards) == 100.0:
            print('Steps until passed = {:d}'.format(n))
            return

    self.fail('TRPO agent did not learn; reward sum over last 100 steps = {:f}'.format(sum(rewards)))
def __init__(self, config, scope):
    """
    :param config: Configuration parameters
    :param scope: TensorFlow scope
    """
    self.session = tf.Session()
    self.total_states = 0
    self.saver = None
    self.config = create_config(config, default=self.default_config)

    self.logger = logging.getLogger(__name__)
    self.logger.setLevel(log_levels[config.get('loglevel', 'info')])

    # This is the scope used to prefix variable creation for distributed TensorFlow
    self.scope = scope
    self.deterministic_mode = config.get('deterministic_mode', False)
    self.episode_length = tf.placeholder(tf.int32, (None,), name='episode_length')
    self.learning_rate = config.get('learning_rate', 0.001)

    if self.config.seed is not None:
        self.random = global_seed(self.config.seed)
        tf.set_random_seed(self.config.seed)
    else:
        self.random = np.random.RandomState()

    optimizer = config.get('optimizer')
    if not optimizer:
        self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
    else:
        args = config.get('optimizer_args', [])
        kwargs = config.get('optimizer_kwargs', {})
        optimizer_cls = get_function(optimizer)
        self.optimizer = optimizer_cls(self.learning_rate, *args, **kwargs)

    exploration = config.get('exploration')
    if not exploration:
        self.exploration = exploration_mode['constant'](self, 0)
    else:
        args = config.get('exploration_args', [])
        kwargs = config.get('exploration_kwargs', {})
        self.exploration = exploration_mode[exploration](self, *args, **kwargs)
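# A hedged config sketch for the optimizer dispatch above, assuming
# get_function resolves a dotted import path to an optimizer class (the
# RMSProp choice and its keyword arguments are hypothetical, not repo defaults):
example_config = {
    'learning_rate': 0.00025,
    'optimizer': 'tensorflow.train.RMSPropOptimizer',
    'optimizer_args': [],
    'optimizer_kwargs': {'decay': 0.95, 'momentum': 0.95, 'epsilon': 0.01}
}
# Omitting the 'optimizer' key falls back to tf.train.AdamOptimizer(learning_rate).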
def __init__(self, config, scope='pg_agent', network_builder=None):
    self.config = create_config(config, default=self.default_config)

    assert issubclass(self.__class__.model, PGModel)
    self.model = self.__class__.model(self.config, scope, network_builder=network_builder)

    self.continuous = self.config.continuous
    self.batch_size = self.config.batch_size

    # Cap episode length so a single episode never exceeds the batch.
    self.max_episode_length = min(self.config.max_episode_length, self.batch_size)

    self.current_batch = []
    self.batch_step = 0
    self.current_episode = self.model.zero_episode()
    self.episode_step = 0

    self.last_action = None
    self.last_action_means = None
    self.last_action_log_std = None
def test_trpo_agent(self):
    config = {
        'batch_size': 8,
        'cg_iterations': 20,
        'cg_damping': 0.001,
        'line_search_steps': 20,
        'max_kl_divergence': 0.01,
        'max_episode_length': 4,
        'continuous': False,
        'state_shape': (2,),
        'actions': 2
    }

    tf.reset_default_graph()

    config = create_config(config)
    network_builder = NeuralNetwork.layered_network(layers=[{'type': 'dense', 'num_outputs': 32}])
    agent = TRPOAgent(config=config, network_builder=network_builder)

    state = (1, 0)
    rewards = [0.0] * 100

    for n in range(100):
        action = agent.get_action(state=state)
        if action == 0:
            state = (1, 0)
            reward = 0.0
            terminal = False
        else:
            state = (0, 1)
            reward = 1.0
            terminal = True
        agent.add_observation(state=state, action=action, reward=reward, terminal=terminal)

        rewards[n % 100] = reward
        if sum(rewards) == 100.0:
            return
def __init__(self, config, scope='dqfd_agent', network_builder=None):
    """
    :param config: Configuration parameters for agent
    :param scope: TensorFlow scope
    :param network_builder: Optional function building the TensorFlow network
    """
    self.config = create_config(config, default=self.default_config)

    # This is the online replay memory
    self.replay_memory = ReplayMemory(**self.config)

    # This is the demonstration memory that we fill with expert observations
    # before starting the main training loop
    # TODO we might want different sizes for these memories -> add config param
    self.demo_memory = ReplayMemory(**self.config)
    self.step_count = 0

    # Called p in the paper; controls the ratio of expert vs online training samples
    self.expert_sampling_ratio = self.config.expert_sampling_ratio

    self.update_repeat = self.config.update_repeat
    self.batch_size = self.config.batch_size

    # p = n_demo / (n_demo + n_replay) => n_demo = p * n_replay / (1 - p)
    self.demo_batch_size = int(self.expert_sampling_ratio * self.batch_size /
                               (1.0 - self.expert_sampling_ratio))

    self.update_steps = int(round(1 / self.config.update_rate))
    self.use_target_network = self.config.use_target_network

    if self.use_target_network:
        self.target_update_steps = int(round(1 / self.config.target_network_update_rate))

    self.min_replay_size = self.config.min_replay_size

    if self.__class__.model:
        self.model = self.__class__.model(self.config, scope, network_builder=network_builder)
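# A hedged worked example of the expert-sampling arithmetic above
# (ratio and batch size are hypothetical values, not repo defaults):
p = 0.2        # expert_sampling_ratio
n_replay = 32  # batch_size drawn from the online replay memory
n_demo = int(p * n_replay / (1.0 - p))
assert n_demo == 8  # each update also samples 8 demonstration transitions,
                    # so p == n_demo / (n_demo + n_replay) == 8 / 40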
def __init__(self, config, scope):
    """
    :param config: Configuration parameters
    :param scope: TensorFlow scope
    """
    # TODO move several default params up here
    self.session = tf.Session()
    self.total_states = 0
    self.saver = None
    self.config = create_config(config, default=self.default_config)

    # This is the scope used to prefix variable creation for distributed TensorFlow
    self.scope = scope
    self.batch_shape = [None]
    self.deterministic_mode = config.get('deterministic_mode', False)
    self.alpha = config.get('alpha', 0.001)

    optimizer = config.get('optimizer')
    if not optimizer:
        self.optimizer = tf.train.AdamOptimizer(self.alpha)
    else:
        args = config.get('optimizer_args', [])
        kwargs = config.get('optimizer_kwargs', {})
        optimizer_cls = get_function(optimizer)
        self.optimizer = optimizer_cls(self.alpha, *args, **kwargs)

    exploration = config.get('exploration')
    if not exploration:
        self.exploration = exploration_mode['constant'](self, 0)
    else:
        args = config.get('exploration_args', [])
        kwargs = config.get('exploration_kwargs', {})
        self.exploration = exploration_mode[exploration](self, *args, **kwargs)
def __init__(self, config, scope, task_index, cluster_spec, define_network=None):
    """
    A distributed agent must synchronise local and global parameters under
    different scopes.

    :param config: Configuration parameters
    :param scope: TensorFlow scope
    :param task_index: Index of this worker task within the cluster
    :param cluster_spec: tf.train.ClusterSpec describing parameter servers and workers
    :param define_network: Optional function building the TensorFlow network
    """
    self.session = None
    self.saver = None
    self.config = create_config(config, default=self.default_config)

    # This is the scope used to prefix variable creation for distributed TensorFlow
    self.scope = scope
    self.task_index = task_index

    self.batch_size = self.config.batch_size
    self.action_count = self.config.actions
    self.use_gae = self.config.use_gae
    self.gae_lambda = self.config.gae_lambda
    self.gamma = self.config.gamma
    self.continuous = self.config.continuous
    self.normalize_advantage = self.config.normalize_advantage

    if self.config.deterministic_mode:
        self.random = global_seed()
    else:
        self.random = np.random.RandomState()

    if define_network is None:
        self.define_network = NeuralNetwork.layered_network(self.config.network_layers)
    else:
        self.define_network = define_network

    self.batch_shape = [None]
    self.deterministic_mode = config.get('deterministic_mode', False)
    self.alpha = config.get('alpha', 0.001)
    self.optimizer = None

    self.worker_device = '/job:worker/task:{}/cpu:0'.format(task_index)

    with tf.device(tf.train.replica_device_setter(1, worker_device=self.worker_device, cluster=cluster_spec)):
        with tf.variable_scope('global'):
            self.global_state = tf.placeholder(
                tf.float32,
                self.batch_shape + list(self.config.state_shape),
                name='global_state')
            self.global_network = NeuralNetwork(self.define_network, [self.global_state])
            self.global_step = tf.get_variable(
                'global_step',
                [],
                tf.int32,
                initializer=tf.constant_initializer(0, dtype=tf.int32),
                trainable=False)

            self.global_prev_action_means = tf.placeholder(
                tf.float32, [None, self.action_count], name='prev_actions')

            # Probability distribution used in the current policy
            if self.continuous:
                self.global_policy = GaussianPolicy(
                    self.global_network, self.session, self.global_state,
                    self.random, self.action_count, 'gaussian_policy')
                self.global_prev_action_log_stds = tf.placeholder(
                    tf.float32, [None, self.action_count])
                self.global_prev_dist = dict(
                    policy_output=self.global_prev_action_means,
                    policy_log_std=self.global_prev_action_log_stds)
            else:
                self.global_policy = CategoricalOneHotPolicy(
                    self.global_network, self.session, self.global_state,
                    self.random, self.action_count, 'categorical_policy')
                self.global_prev_dist = dict(policy_output=self.global_prev_action_means)

            # Baseline value function for advantage estimation
            self.global_baseline_value_function = LinearValueFunction()

    # self.optimizer = config.get('optimizer')
    # self.optimizer_args = config.get('optimizer_args', [])
    # self.optimizer_kwargs = config.get('optimizer_kwargs', {})

    exploration = config.get('exploration')
    if not exploration:
        self.exploration = exploration_mode['constant'](self, 0)
    else:
        args = config.get('exploration_args', [])
        kwargs = config.get('exploration_kwargs', {})
        self.exploration = exploration_mode[exploration](self, *args, **kwargs)

    self.create_training_operations()
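# A hedged sketch of the cluster layout this constructor expects: one
# parameter server plus workers, matching replica_device_setter(1, ...) and
# the worker_device string above (host/port values are hypothetical):
import tensorflow as tf

cluster_spec = tf.train.ClusterSpec({
    'ps': ['localhost:2222'],
    'worker': ['localhost:2223', 'localhost:2224']
})
# task_index selects which 'worker' entry this process binds to, e.g.
# task_index=0 -> '/job:worker/task:0/cpu:0'.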
def test_dqfd_agent(self):
    config = {
        'expert_sampling_ratio': 0.01,
        'supervised_weight': 0.5,
        'expert_margin': 1,
        'batch_size': 8,
        'state_shape': (2,),
        'actions': 2,
        'action_shape': (),
        'update_rate': 1,
        'update_repeat': 4,
        'min_replay_size': 20,
        'memory_capacity': 20,
        'exploration': 'epsilon_decay',
        'exploration_param': {
            'epsilon': 0,
            'epsilon_final': 0,
            'epsilon_states': 0
        },
        'target_network_update_rate': 1.0,
        'use_target_network': True,
        'alpha': 0.00004,
        'gamma': 1,
        'tau': 1.0
    }

    tf.reset_default_graph()

    config = create_config(config)
    network_builder = NeuralNetwork.layered_network(layers=[
        {
            'type': 'dense',
            'num_outputs': 16,
            'weights_regularizer': 'tensorflow.contrib.layers.python.layers.regularizers.l2_regularizer',
            'weights_regularizer_kwargs': {'scale': 0.01}
        },
        {'type': 'linear', 'num_outputs': 2}
    ])
    agent = DQFDAgent(config=config, network_builder=network_builder)

    state = (1, 0)
    rewards = [0.0] * 100

    # First: fill the demonstration memory
    for n in range(50):
        action = agent.get_action(state=state)
        if action == 0:
            state = (1, 0)
            reward = 0.0
        else:
            state = (0, 1)
            reward = 1.0
        agent.add_demo_observation(state=state, action=action, reward=reward, terminal=False)

    # Pre-train from demonstration data
    agent.pre_train(10000)

    # If pre-training worked, we should not need much more online training
    for n in range(1000):
        action = agent.get_action(state=state)
        if action == 0:
            state = (1, 0)
            reward = 0.0
        else:
            state = (0, 1)
            reward = 1.0
        agent.add_observation(state=state, action=action, reward=reward, terminal=False)

        rewards[n % 100] = reward
        if sum(rewards) == 100.0:
            print('Passed after steps = {:d}'.format(n))
            return

    self.fail('DQFD agent did not learn; reward sum over last 100 steps = {:f}'.format(sum(rewards)))