def reset(self):
    """Start over from epoch zero and return the first feature vector.

    Zeroes the epoch counter, fetches a state from ``self._next_epoch()``,
    and caches both the raw state (``self.last['state']``) and its
    featurized form (``self.last['state_v']``).

    Returns:
        The value stored in ``self.last['state_v']``.
    """
    self._epoch_num = 0

    initial_state = self._next_epoch()
    self.last['state'] = initial_state

    # NOTE(review): the featurizer is given a literal 1 here rather than
    # the just-zeroed epoch counter -- presumably intentional; confirm
    # against featurizer.featurize before changing it.
    self.last['state_v'] = featurizer.featurize(initial_state, 1)

    return self.last['state_v']
def step(self, policy):
    """Apply ``policy``, run one more epoch, and report the outcome.

    The policy is handed to ``self._apply_policy`` and the epoch counter is
    incremented. The state returned by ``self._next_epoch()`` is then cached
    in ``self.last`` under ``'reward'``, ``'state'`` and ``'state_v'``
    (the featurized state), and the agent is notified through
    ``on_ai_step()``.

    Args:
        policy: Passed unchanged to ``self._apply_policy``.

    Returns:
        A 4-tuple ``(state_v, reward, done, info)`` where ``done`` is the
        negation of ``self._agent.is_training()`` and ``info`` is always an
        empty dict. (The layout looks like the Gym ``Env.step`` convention
        -- TODO confirm against the enclosing class.)
    """
    # Turn the policy into parameters and push them to the algorithm
    # before counting the new epoch.
    self._apply_policy(policy)
    self._epoch_num += 1

    # Wait for the algorithm to run with the new parameters.
    epoch_state = self._next_epoch()

    self.last['reward'] = epoch_state['reward']
    self.last['state'] = epoch_state
    self.last['state_v'] = featurizer.featurize(epoch_state, self._epoch_num)

    self._agent.on_ai_step()

    # Read the cached values before querying the agent, matching the
    # left-to-right evaluation order of the returned tuple.
    observation = self.last['state_v']
    reward = self.last['reward']
    done = not self._agent.is_training()
    return observation, reward, done, {}