Code Example #1
 def reset(self):
     self.state = np.array([np.random.uniform(low=-0.6, high=-0.4), 0])
     self.observation = Observation(
         reward=0.0,
         state=np.array(self.state),
         is_episode_over=self.is_over()
     )
     self.steps_elapsed = 0
     return self.observe()
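
Every snippet in this listing packs its return value into an Observation record built from reward, state, and is_episode_over (Example #6 also passes an id, and Examples #5 and #7 construct one with no arguments, so each field needs a default). The class itself is not part of these excerpts; a minimal sketch of such a container, assuming a plain Python class with those field names:

    class Observation:
        def __init__(self, id=None, reward=0.0, state=None, is_episode_over=False):
            # One step's worth of feedback: the reward just earned, the
            # (featurized) state, and whether the episode has terminated.
            self.id = id
            self.reward = reward
            self.state = state
            self.is_episode_over = is_episode_over
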
Code Example #2
    def reset(self):
        self.state = np.random.uniform(low=-0.05, high=0.05, size=(4, ))
        self.steps_beyond_done = None
        self.steps_elapsed = 0

        self.observation = Observation(reward=0.0,
                                       state=np.array(self.state),
                                       is_episode_over=self.is_over())

        return self.observe()
Code Example #3
 def observe(self):
     self.observation = Observation(reward=self.reward,
                                    state=np.array([
                                        np.cos(self.state[0]),
                                        np.sin(self.state[0]),
                                        np.cos(self.state[1]),
                                        np.sin(self.state[1]),
                                        self.state[2],
                                        self.state[3]
                                    ]),
                                    is_episode_over=self.is_over())
     return self.observation
Code Example #4
 def reset(self):
     self.state = np.random.uniform(low=-0.1, high=0.1, size=(4, ))
     self.steps_elapsed = 0
     self.reward = 0.0
     self.observation = Observation(reward=self.reward,
                                    state=np.array([
                                        np.cos(self.state[0]),
                                        np.sin(self.state[0]),
                                        np.cos(self.state[1]),
                                        np.sin(self.state[1]),
                                        self.state[2],
                                        self.state[3]
                                    ]),
                                    is_episode_over=self.is_over())
     return self.observe()
Code Example #5
 def __init__(self, environment, task=TARGET):
     super(SelfPlayTarget, self).__init__(environment=environment,
                                          task=task)
     self.name = SELFPLAY + "_" + TARGET + "_" + self.environment.name
     self.alice_start_environment = None
     self.agent_id = 1
     self.agents = (BOB,)  # trailing comma makes this a single-element tuple, matching (ALICE, BOB) in the base class
     self.observation = Observation()
     self.alice_observations = None
     self.bob_observations = ObservationTuple()
     _all_possible_actions = self.environment.all_possible_actions()
     self.stop_action = None
     self.actions = _all_possible_actions
     self.is_over = None
     self.task = task
Code Example #6
 def observe(self):
     game_observation = self.game.observe()
     # Logic borrowed from:
     # https://github.com/facebook/MazeBase/blob/23454fe092ecf35a8aab4da4972f231c6458209b/py/example.py#L192
     obs, info = game_observation[OBSERVATION]
     featurizers.grid_one_hot(self.game, obs)
     obs = np.array(obs)
     featurizers.vocabify(self.game, info)
     info = np.array(obs)
     game_observation[OBSERVATION] = np.concatenate((obs, info),
                                                    2).flatten()
     is_episode_over = self.game.is_over()
     return Observation(id=game_observation[ID],
                        reward=game_observation[REWARD],
                        state=game_observation[OBSERVATION],
                        is_episode_over=is_episode_over)
Code Example #7
 def __init__(self, environment, task=None):
     super(SelfPlay, self).__init__()
     self.environment = environment
     self.name = SELFPLAY + "_" + self.environment.name
     self.alice_start_environment = None
     self.alice_end_environment = None
     # The environment (along with the state) in which alice starts
     self.agent_id = 0
     self.agents = (ALICE, BOB)
     self.observation = Observation()
     self.alice_observations = ObservationTuple()
     self.bob_observations = ObservationTuple()
     _all_possible_actions = self.environment.all_possible_actions()
     self.stop_action = len(_all_possible_actions)
     self.actions = _all_possible_actions
     self.is_over = None
     self.task = task
Code Example #8
    def act(self, action):
        assert self.action_space.contains(action), "%r (%s) invalid" % (action, type(action))

        position, velocity = self.state
        velocity += (action - 1) * 0.001 + math.cos(3 * position) * (-0.0025)
        velocity = np.clip(velocity, -self.max_speed, self.max_speed)
        position += velocity
        position = np.clip(position, self.min_position, self.max_position)
        if (position == self.min_position and velocity < 0): velocity = 0

        reward = -1.0
        self.state = (position, velocity)
        self.steps_elapsed += 1

        self.observation = Observation(reward=reward,
                                       state=np.array(self.state),
                                       is_episode_over=self.is_over())
        return self.observe()
Code Example #9
    def act(self, action):
        assert self.action_space.contains(
            action), "%r (%s) invalid" % (action, type(action))
        state = self.state
        x, x_dot, theta, theta_dot = state
        force = self.force_mag if action == 1 else -self.force_mag
        costheta = math.cos(theta)
        sintheta = math.sin(theta)
        temp = (force + self.polemass_length * theta_dot * theta_dot *
                sintheta) / self.total_mass
        thetaacc = (self.gravity * sintheta - costheta * temp) / (
            self.length *
            (4.0 / 3.0 -
             self.masspole * costheta * costheta / self.total_mass))
        xacc = temp - self.polemass_length * thetaacc * costheta / self.total_mass
        x = x + self.tau * x_dot
        x_dot = x_dot + self.tau * xacc
        theta = theta + self.tau * theta_dot
        theta_dot = theta_dot + self.tau * thetaacc
        self.state = (x, x_dot, theta, theta_dot)
        self.steps_elapsed += 1

        done = self.is_over()

        if not done:
            reward = 1.0
        elif self.steps_beyond_done is None:
            # Pole just fell!
            self.steps_beyond_done = 0
            reward = 1.0
        else:
            if self.steps_beyond_done == 0:
                logger.warn(
                    "You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior."
                )
            self.steps_beyond_done += 1
            reward = 0.0

        self.observation = Observation(reward=reward,
                                       state=np.array(self.state),
                                       is_episode_over=self.is_over())
        return self.observe()
Code Example #10
    def act(self, action):
        s = self.state
        torque = self.AVAIL_TORQUE[action]

        # Add noise to the force action
        if self.torque_noise_max > 0:
            torque += np.random.uniform(-self.torque_noise_max,
                                        self.torque_noise_max)

        # Now, augment the state with our force action so it can be passed to
        # _dsdt
        s_augmented = np.append(s, torque)

        ns = rk4(self._dsdt, s_augmented, [0, self.dt])
        # only care about final timestep of integration returned by integrator
        ns = ns[-1]
        ns = ns[:4]  # omit action
        # ODEINT IS TOO SLOW!
        # ns_continuous = integrate.odeint(self._dsdt, self.s_continuous, [0, self.dt])
        # self.s_continuous = ns_continuous[-1] # We only care about the state
        # at the ''final timestep'', self.dt

        ns[0] = wrap(ns[0], -pi, pi)
        ns[1] = wrap(ns[1], -pi, pi)
        ns[2] = bound(ns[2], -self.MAX_VEL_1, self.MAX_VEL_1)
        ns[3] = bound(ns[3], -self.MAX_VEL_2, self.MAX_VEL_2)
        self.state = ns
        self.steps_elapsed += 1

        is_over = self.is_over()
        self.reward = -1. if not is_over else 0.

        self.observation = Observation(reward=self.reward,
                                       state=np.array([
                                           np.cos(ns[0]),
                                           np.sin(ns[0]),
                                           np.cos(ns[1]),
                                           np.sin(ns[1]),
                                           ns[2],
                                           ns[3]
                                       ]),
                                       is_episode_over=self.is_over())
        return self.observe()
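
All of the environments above share the same loop-facing surface: reset() returns the first Observation and act() returns the next one, so a driver only ever inspects the returned record. A hypothetical sketch of such a driver, assuming a gym-style action_space with a sample() method (as asserted in Examples #8 and #9); run_episode is an illustrative helper, not part of the original code:

    def run_episode(env):
        # Drive one episode of any environment exposing the reset()/act()
        # API shown above and accumulate the per-step rewards.
        observation = env.reset()
        total_reward = 0.0
        while not observation.is_episode_over:
            action = env.action_space.sample()  # random action for illustration
            observation = env.act(action)
            total_reward += observation.reward
        return total_reward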