from threading import Lock

import numpy as np
import tensorflow as tf

# Algorithm, Experience, Model, Sequential, Decay, Every, Memory, Metric,
# AttrDict, merge_dicts, networks, dense, and the preprocessing and partial
# policies come from the surrounding package (imports omitted here).


class A3C(Algorithm):
    """
    Algorithm: Asynchronous Advantage Actor Critic (A3C)
    Paper: Asynchronous Methods for Deep Reinforcement Learning
    Authors: Mnih et al. 2016
    PDF: https://arxiv.org/pdf/1602.01783v2.pdf
    """

    @classmethod
    def defaults(cls):
        # Preprocessing.
        subsample = 2
        frame_skip = 4
        history = 4
        delta = False
        frame_max = 2
        noop_max = 30
        # Architecture.
        learners = 16
        apply_gradient = 5
        network = 'network_a3c_lstm'
        scale_critic_loss = 0.5
        regularize = 0.01
        # Optimizer.
        initial_learning_rate = 7e-4
        optimizer = tf.train.RMSPropOptimizer
        rms_decay = 0.99
        rms_epsilon = 0.1
        return merge_dicts(super().defaults(), locals())

    def __init__(self, task, config):
        super().__init__(task, config)
        self._preprocess = self._create_preprocess()
        self.model = Model(self._create_network)
        # print(str(self.model))
        self.learning_rate = Decay(
            float(config.initial_learning_rate), 0, self.task.steps)
        self.lock = Lock()
        self.costs = None
        self.values = None
        self.choices = None

    @property
    def train_policies(self):
        trainers = []
        for _ in range(self.config.learners):
            config = AttrDict(self.config.copy())
            # TODO: Use single model to share RMSProp statistics.
            model = Model(self._create_network, threads=1)
            model.weights = self.model.weights
            policy = Sequential(self.task)
            policy.add(self._create_preprocess())
            policy.add(Train, config, self, model)
            trainers.append(policy)
        return trainers

    @property
    def test_policy(self):
        policy = Sequential(self.task)
        policy.add(self._preprocess)
        policy.add(Test, self.model)
        return policy

    def begin_epoch(self):
        super().begin_epoch()
        self.costs = []
        self.values = []
        self.choices = []

    def end_epoch(self):
        super().end_epoch()
        if self.costs:
            average = sum(self.costs) / len(self.costs)
            print('Cost {:12.5f}'.format(average))
        if self.values:
            average = sum(self.values) / len(self.values)
            print('Value {:12.5f}'.format(average))
        if self.choices:
            dist = np.bincount(self.choices) / len(self.choices)
            dist = ' '.join('{:.2f}'.format(x) for x in dist)
            print('Choices [{}]'.format(dist))
        if self.task.directory:
            self.model.save(self.task.directory, 'model')

    def _create_network(self, model):
        observs = self._preprocess.above_task.observs
        actions = self._preprocess.above_task.actions
        # Perception.
        state = model.add_input('state', observs.shape)
        hidden = getattr(networks, self.config.network)(model, state)
        value = model.add_output(
            'value', tf.squeeze(dense(hidden, 1, tf.identity), [1]))
        # Policy head computed from the shared hidden features.
        policy = dense(hidden, actions.n, tf.nn.softmax)
        model.add_output(
            'choice', tf.squeeze(tf.multinomial(tf.log(policy), 1), [1]))
        # Objectives.
        action = model.add_input('action', type_=tf.int32)
        action = tf.one_hot(action, actions.n)
        return_ = model.add_input('return_')
        logprob = tf.log(tf.reduce_sum(policy * action, 1) + 1e-13)
        entropy = -tf.reduce_sum(tf.log(policy + 1e-13) * policy)
        advantage = tf.stop_gradient(return_ - value)
        actor = advantage * logprob + self.config.regularize * entropy
        critic = self.config.scale_critic_loss * (return_ - value) ** 2 / 2
        # Training.
        learning_rate = model.add_option(
            'learning_rate', float(self.config.initial_learning_rate))
        model.set_optimizer(self.config.optimizer(
            learning_rate, self.config.rms_decay, use_locking=True))
        model.add_cost('cost', critic - actor)

    def _create_preprocess(self):
        policy = Sequential(self.task)
        if self.config.noop_max:
            policy.add(RandomStart, self.config.noop_max)
        if self.config.frame_skip:
            policy.add(Skip, self.config.frame_skip)
        if self.config.frame_max:
            policy.add(Maximum, self.config.frame_max)
        if self.config.history:
            policy.add(Grayscale)
        if self.config.subsample > 1:
            sub = self.config.subsample
            amount = (sub, sub) if self.config.history else (sub, sub, 1)
            policy.add(Subsample, amount)
        if self.config.delta:
            policy.add(Delta)
        if self.config.history:
            policy.add(History, self.config.history)
        policy.add(ClampReward)
        policy.add(Normalize)
        return policy
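# --- Illustrative sketch (not part of the original algorithm code) ---
# The Train partial policy added in train_policies is defined elsewhere in the
# package and is not shown here. It gathers short rollouts and feeds
# bootstrapped n-step returns into the network's 'return_' input every
# `apply_gradient` steps. The helper below is a minimal NumPy sketch of that
# return computation; the function name and signature are hypothetical.

import numpy as np


def n_step_returns(rewards, bootstrap_value, discount=0.99):
    """Discounted returns for a short rollout, bootstrapped from the critic's
    value estimate of the state that follows the last reward."""
    returns = np.zeros(len(rewards))
    accumulated = bootstrap_value
    for index in reversed(range(len(rewards))):
        accumulated = rewards[index] + discount * accumulated
        returns[index] = accumulated
    return returns


# Example: three rewards with a critic estimate of 0.5 for the next state.
# n_step_returns([1.0, 0.0, 1.0], bootstrap_value=0.5)
# -> approximately [2.465, 1.480, 1.495]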
class DQN(Algorithm, Experience):
    """
    Algorithm: Deep Q-Network (DQN)
    Paper: Human-level control through deep reinforcement learning
    Authors: Mnih et al. 2015
    PDF: https://goo.gl/Y3e373
    """

    @classmethod
    def defaults(cls):
        # Preprocessing.
        subsample = 2
        frame_skip = 4
        history = 4
        delta = False
        frame_max = 2
        noop_max = 30
        # Architecture.
        network = 'network_dqn_2015'
        replay_capacity = 1e5  # 1e6
        start_learning = 5e4
        # Exploration.
        epsilon = dict(
            from_=1.0, to=0.1, test=0.05, over=1e6, offset=start_learning)
        # Learning.
        batch_size = 32
        sync_target = 2500
        # Optimizer.
        initial_learning_rate = 2.5e-4
        optimizer = tf.train.RMSPropOptimizer
        rms_decay = 0.95
        rms_epsilon = 0.1
        return merge_dicts(super().defaults(), locals())

    def __init__(self, task, config):
        Algorithm.__init__(self, task, config)
        # Scale parameters.
        assert self.config.start_learning <= self.config.replay_capacity
        assert self.config.start_learning >= self.config.batch_size
        self.config.start_learning *= self.config.frame_skip
        self.config.sync_target *= self.config.frame_skip
        self.config.epsilon.over *= self.config.frame_skip
        # Preprocessing.
        self._preprocess = self._create_preprocess()
        Experience.__init__(self, self._preprocess.above_task)
        # Network.
        self._model = Model(self._create_network)
        self._target = Model(self._create_network)
        self._target.weights = self._model.weights
        self._sync_target = Every(
            self.config.sync_target, self.config.start_learning)
        print(str(self._model))
        # Learning.
        observ_shape = self._preprocess.above_task.observs.shape
        shapes = (observ_shape, tuple(), tuple(), observ_shape)
        self._memory = Memory(self.config.replay_capacity, shapes)
        self._log_memory_size()
        self._learning_rate = Decay(
            self.config.initial_learning_rate, 0, self.task.steps)
        self._cost_metric = Metric(self.task, 'dqn/cost', 1)
        self._learning_rate_metric = Metric(self.task, 'dqn/learning_rate', 1)

    def end_epoch(self):
        super().end_epoch()
        if self.task.directory:
            self._model.save(self.task.directory, 'model')

    def perform(self, observ):
        return self._model.compute('values', state=observ)

    def experience(self, observ, action, reward, successor):
        action = action.argmax()
        self._memory.append((observ, action, reward, successor))
        if self.task.step < self.config.start_learning:
            return
        observ, action, reward, successor = \
            self._memory.sample(self.config.batch_size)
        target = self._compute_target(reward, successor)
        if self._sync_target(self.task.step):
            self._target.weights = self._model.weights
        self._model.set_option(
            'learning_rate', self._learning_rate(self.task.step))
        cost = self._model.train(
            'cost', state=observ, action=action, target=target)
        self._learning_rate_metric(self._model.get_option('learning_rate'))
        self._cost_metric(cost)

    @property
    def policy(self):
        # TODO: Why doesn't self.task work here?
        policy = Sequential(self._preprocess.task)
        policy.add(self._preprocess)
        policy.add(self)
        return policy

    def _create_preprocess(self):
        policy = Sequential(self.task)
        policy.add(Image)
        if self.config.noop_max:
            policy.add(RandomStart, self.config.noop_max)
        if self.config.frame_skip > 1:
            policy.add(Skip, self.config.frame_skip)
        if self.config.frame_max:
            policy.add(Maximum, self.config.frame_max)
        if self.config.history > 1:
            channels = policy.above_task.observs.shape[-1]
            policy.add(Grayscale, (0.299, 0.587, 0.114)[:channels])
        if self.config.subsample > 1:
            sub = self.config.subsample
            amount = (sub, sub) if self.config.history > 1 else (sub, sub, 1)
            policy.add(Subsample, amount)
        if self.config.delta:
            policy.add(Delta)
        if self.config.history > 1:
            policy.add(History, self.config.history)
        policy.add(Normalize)
        policy.add(ClampReward)
        policy.add(EpsilonGreedy, **self.config.epsilon)
        return policy

    def _create_network(self, model):
        observs = self._preprocess.above_task.observs.shape
        actions = self._preprocess.above_task.actions.shape[0]
        # Perception.
        state = model.add_input('state', observs)
        hidden = getattr(networks, self.config.network)(model, state)
        values = dense(hidden, actions, tf.identity)
        values = model.add_output('values', values)
        # Training.
        action = model.add_input('action', type_=tf.int32)
        action = tf.one_hot(action, actions)
        target = model.add_input('target')
        model.add_output('value', tf.reduce_max(values, 1))
        # Optimization.
        learning_rate = model.add_option(
            'learning_rate', self.config.initial_learning_rate)
        model.set_optimizer(self.config.optimizer(
            learning_rate=learning_rate,
            decay=self.config.rms_decay,
            epsilon=self.config.rms_epsilon))
        model.add_cost(
            'cost', (tf.reduce_sum(action * values, 1) - target) ** 2)

    def _compute_target(self, reward, successor):
        terminal = np.isnan(successor.reshape((len(successor), -1))).any(1)
        successor = np.nan_to_num(successor)
        assert np.isfinite(successor).all()
        future = self._target.compute('value', state=successor)
        future[terminal] = 0
        target = reward + self.config.discount * future
        assert np.isfinite(target).all()
        return target

    def _log_memory_size(self):
        size = self._memory.nbytes / (1024 ** 3)
        print('Replay memory size', round(size, 2), 'GB')
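# --- Illustrative sketch (not part of the original algorithm code) ---
# _compute_target above marks terminal transitions by storing NaN successors
# in the replay memory and zeroing their future value. The helper below is a
# minimal NumPy sketch of the resulting Q-learning target,
# reward + discount * max_a Q_target(successor, a); the function name and
# signature are hypothetical.

import numpy as np


def bellman_target(reward, successor_value, terminal, discount=0.99):
    """Immediate reward plus the discounted value of the successor state,
    with no future value for transitions that ended the episode."""
    future = np.where(terminal, 0.0, successor_value)
    return reward + discount * future


# Example batch where the second transition ends the episode.
# reward = np.array([1.0, 0.0, 0.5])
# successor_value = np.array([2.0, 3.0, 1.0])  # max_a Q_target(s', a)
# terminal = np.array([False, True, False])
# bellman_target(reward, successor_value, terminal)
# -> array([2.98, 0.0, 1.49])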