def policy(task, step):
    # Debugging helper: prints the observation interface before the given step
    # is added and the action interface afterwards, then appends a Random step.
    policy = Sequential(task)
    print('Step: ', step.__name__)
    print('Input: ', policy.task.observs)
    policy.add(step)
    print('Output:', policy.task.actions)
    policy.add(Random)
    return policy
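# A minimal usage sketch (an assumption, not part of the original code): wrap a
# single step to see how it changes the task interface. `env_task` is a
# hypothetical task object; Grayscale is one of the step classes used below.
debug_policy = policy(env_task, Grayscale)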
def train_policies(self):
    trainers = []
    for _ in range(self.config.learners):
        # Each learner gets its own config copy and a model that shares the
        # weights of the central model.
        config = AttrDict(self.config.copy())
        model = Model(self._create_network, threads=1)
        model.weights = self.model.weights
        policy = Sequential(self.task)
        policy.add(self._create_preprocess())
        policy.add(Train, config, self, model)
        trainers.append(policy)
    return trainers
def _create_preprocess(self):
    policy = Sequential(self.task)
    # Network preprocessing.
    # policy.add(Skip, self.config.frameskip)
    # policy.add(Maximum, 2)
    # policy.add(Grayscale)
    # # policy.add(Subsample, (2, 2))
    # # policy.add(Delta)
    # policy.add(History, 3)
    # policy.add(ClampReward)
    # policy.add(EpsilonGreedy, from_=0.5, to=0.5, test=0.5)
    policy.add(Normalize)
    return policy
def test_policy(self):
    policy = Sequential(self.task)
    policy.add(self._preprocess)
    policy.add(Test, self.model)
    return policy
def _create_preprocess(self):
    policy = Sequential(self.task)
    if self.config.noop_max:
        policy.add(RandomStart, self.config.noop_max)
    if self.config.frame_skip:
        policy.add(Skip, self.config.frame_skip)
    if self.config.frame_max:
        policy.add(Maximum, self.config.frame_max)
    if self.config.history:
        policy.add(Grayscale)
    if self.config.subsample > 1:
        sub = self.config.subsample
        amount = (sub, sub) if self.config.history else (sub, sub, 1)
        policy.add(Subsample, amount)
    if self.config.delta:
        policy.add(Delta)
    if self.config.history:
        policy.add(History, self.config.history)
    policy.add(ClampReward)
    policy.add(Normalize)
    return policy
def _create_preprocess(self):
    policy = Sequential(self.task)
    policy.add(Image)
    if self.config.noop_max:
        policy.add(RandomStart, self.config.noop_max)
    if self.config.frame_skip > 1:
        policy.add(Skip, self.config.frame_skip)
    if self.config.frame_max:
        policy.add(Maximum, self.config.frame_max)
    if self.config.history > 1:
        # Standard luminance weights, truncated to the available channels.
        channels = policy.above_task.observs.shape[-1]
        policy.add(Grayscale, (0.299, 0.587, 0.114)[:channels])
    if self.config.subsample > 1:
        sub = self.config.subsample
        amount = (sub, sub) if self.config.history > 1 else (sub, sub, 1)
        policy.add(Subsample, amount)
    if self.config.delta:
        policy.add(Delta)
    if self.config.history > 1:
        policy.add(History, self.config.history)
    policy.add(Normalize)
    policy.add(ClampReward)
    policy.add(EpsilonGreedy, **self.config.epsilon)
    return policy
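# A hypothetical config for the pipeline above, using only the fields the code
# reads; the values are illustrative and not taken from the original project.
config = AttrDict(
    noop_max=30,        # RandomStart: up to 30 initial no-op actions
    frame_skip=4,       # Skip: repeat each action for 4 frames
    frame_max=2,        # Maximum: pixel-wise max over the last 2 frames
    history=4,          # Grayscale + History: stack 4 grayscale frames
    subsample=2,        # Subsample: halve the image resolution
    delta=False,        # Delta: no frame differencing
    epsilon=dict(from_=1.0, to=0.1, test=0.05),  # kwargs for EpsilonGreedy
)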
def policy(self):
    # TODO: Why doesn't self.task work here?
    policy = Sequential(self._preprocess.task)
    policy.add(self._preprocess)
    policy.add(self)
    return policy
def policy(self):
    policy = Sequential(self._preprocess.task)
    policy.add(self._preprocess)
    policy.add(self)
    return policy
def _prepend_score_step(self, policy):
    combined = Sequential(policy.task)
    combined.add(Score)
    combined.add(policy)
    return combined
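# A minimal usage sketch (an assumption, not from the original code): wrap an
# existing policy so a Score step runs before it, reusing the test_policy()
# pattern above. `agent` stands for any object exposing these methods.
test = agent.test_policy()
scored = agent._prepend_score_step(test)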