def reset(self):
    self.state = np.array([np.random.uniform(low=-0.6, high=-0.4), 0])
    self.observation = Observation(
        reward=0.0,
        state=np.array(self.state),
        is_episode_over=self.is_over())
    self.steps_elapsed = 0
    return self.observe()
def reset(self):
    self.state = np.random.uniform(low=-0.05, high=0.05, size=(4,))
    self.steps_beyond_done = None
    self.steps_elapsed = 0
    self.observation = Observation(
        reward=0.0,
        state=np.array(self.state),
        is_episode_over=self.is_over())
    return self.observe()
def observe(self):
    # Featurize the two joint angles as (cos, sin) pairs so the observation
    # stays continuous across the +/-pi wrap-around.
    self.observation = Observation(
        reward=self.reward,
        state=np.array([
            cos(self.state[0]), sin(self.state[0]),
            cos(self.state[1]), sin(self.state[1]),
            self.state[2], self.state[3]
        ]),
        is_episode_over=self.is_over())
    return self.observation
def reset(self):
    self.state = np.random.uniform(low=-0.1, high=0.1, size=(4,))
    self.steps_elapsed = 0
    self.reward = 0.0
    self.observation = Observation(
        reward=self.reward,
        state=np.array([
            cos(self.state[0]), sin(self.state[0]),
            cos(self.state[1]), sin(self.state[1]),
            self.state[2], self.state[3]
        ]),
        is_episode_over=self.is_over())
    return self.observe()
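# Aside (standalone sketch, not part of the source): the (cos, sin)
# featurization used in reset()/observe() above keeps the observation
# continuous across the +/-pi wrap-around, and the raw angle remains
# recoverable via arctan2:
import numpy as np

theta = np.pi - 1e-3                 # angle just below the wrap point
features = [np.cos(theta), np.sin(theta)]
recovered = np.arctan2(features[1], features[0])
assert np.isclose(recovered, theta)  # arctan2(sin, cos) inverts the encoding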
def __init__(self, environment, task=TARGET):
    super(SelfPlayTarget, self).__init__(environment=environment, task=task)
    self.name = SELFPLAY + "_" + TARGET + "_" + self.environment.name
    self.alice_start_environment = None
    self.agent_id = 1
    # Trailing comma matters: (BOB) is just BOB, not a one-element tuple.
    self.agents = (BOB,)
    self.observation = Observation()
    self.alice_observations = None
    self.bob_observations = ObservationTuple()
    _all_possible_actions = self.environment.all_possible_actions()
    # Only Bob plays in the target task, so there is no explicit STOP action.
    self.stop_action = None
    self.actions = _all_possible_actions
    self.is_over = None
    self.task = task
def observe(self):
    game_observation = self.game.observe()
    # Logic borrowed from:
    # https://github.com/facebook/MazeBase/blob/23454fe092ecf35a8aab4da4972f231c6458209b/py/example.py#L192
    obs, info = game_observation[OBSERVATION]
    featurizers.grid_one_hot(self.game, obs)
    obs = np.array(obs)
    featurizers.vocabify(self.game, info)
    # Convert the vocabified info (not obs) before concatenation.
    info = np.array(info)
    game_observation[OBSERVATION] = np.concatenate((obs, info), 2).flatten()
    is_episode_over = self.game.is_over()
    return Observation(
        id=game_observation[ID],
        reward=game_observation[REWARD],
        state=game_observation[OBSERVATION],
        is_episode_over=is_episode_over)
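# Aside (toy shape check, arrays are placeholders): concatenating the one-hot
# grid with the per-cell info along axis 2 and flattening yields one flat
# feature vector, as done in observe() above.
import numpy as np

obs = np.zeros((3, 3, 5))   # e.g. a 3x3 grid with 5 one-hot channels
info = np.zeros((3, 3, 2))  # e.g. 2 extra info channels per cell
flat = np.concatenate((obs, info), 2).flatten()
assert flat.shape == (3 * 3 * (5 + 2),)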
def __init__(self, environment, task=None):
    super(SelfPlay, self).__init__()
    self.environment = environment
    self.name = SELFPLAY + "_" + self.environment.name
    # The environment (along with the state) in which Alice starts.
    self.alice_start_environment = None
    self.alice_end_environment = None
    self.agent_id = 0
    self.agents = (ALICE, BOB)
    self.observation = Observation()
    self.alice_observations = ObservationTuple()
    self.bob_observations = ObservationTuple()
    _all_possible_actions = self.environment.all_possible_actions()
    # The STOP action is appended after the environment's own actions, so
    # its index is len(_all_possible_actions).
    self.stop_action = len(_all_possible_actions)
    self.actions = _all_possible_actions
    self.is_over = None
    self.task = task
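# Aside (hypothetical helper, not in the source): with stop_action defined as
# len(_all_possible_actions), an agent's action index is either one of the
# environment's own actions or the extra STOP index at the end. One way such
# an index might be decoded under that convention:
def decode_selfplay_action(action_index, env_actions, stop_action):
    """Return (is_stop, env_action); env_action is None for STOP."""
    if stop_action is not None and action_index == stop_action:
        return True, None
    return False, env_actions[action_index]

# e.g. with env_actions=["left", "right"] and stop_action=2:
#   decode_selfplay_action(2, ["left", "right"], 2) -> (True, None)
#   decode_selfplay_action(0, ["left", "right"], 2) -> (False, "left")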
def act(self, action):
    assert self.action_space.contains(action), \
        "%r (%s) invalid" % (action, type(action))
    position, velocity = self.state
    # Mountain-car dynamics: action in {0, 1, 2} maps to {-1, 0, +1} thrust.
    velocity += (action - 1) * 0.001 + math.cos(3 * position) * (-0.0025)
    velocity = np.clip(velocity, -self.max_speed, self.max_speed)
    position += velocity
    position = np.clip(position, self.min_position, self.max_position)
    if position == self.min_position and velocity < 0:
        # Inelastic collision with the left wall.
        velocity = 0
    reward = -1.0  # constant step cost until the episode ends
    self.state = (position, velocity)
    self.steps_elapsed += 1
    self.observation = Observation(
        reward=reward,
        state=np.array(self.state),
        is_episode_over=self.is_over())
    return self.observe()
def act(self, action):
    assert self.action_space.contains(action), \
        "%r (%s) invalid" % (action, type(action))
    x, x_dot, theta, theta_dot = self.state
    force = self.force_mag if action == 1 else -self.force_mag
    costheta = math.cos(theta)
    sintheta = math.sin(theta)
    # Cart-pole dynamics, followed by one step of Euler integration.
    temp = (force + self.polemass_length * theta_dot * theta_dot *
            sintheta) / self.total_mass
    thetaacc = (self.gravity * sintheta - costheta * temp) / (
        self.length *
        (4.0 / 3.0 - self.masspole * costheta * costheta / self.total_mass))
    xacc = temp - self.polemass_length * thetaacc * costheta / self.total_mass
    x = x + self.tau * x_dot
    x_dot = x_dot + self.tau * xacc
    theta = theta + self.tau * theta_dot
    theta_dot = theta_dot + self.tau * thetaacc
    self.state = (x, x_dot, theta, theta_dot)
    self.steps_elapsed += 1
    done = self.is_over()
    if not done:
        reward = 1.0
    elif self.steps_beyond_done is None:
        # Pole just fell!
        self.steps_beyond_done = 0
        reward = 1.0
    else:
        if self.steps_beyond_done == 0:
            logger.warn(
                "You are calling 'step()' even though this environment has "
                "already returned done = True. You should always call "
                "'reset()' once you receive 'done = True' -- any further "
                "steps are undefined behavior.")
        self.steps_beyond_done += 1
        reward = 0.0
    self.observation = Observation(
        reward=reward,
        state=np.array(self.state),
        is_episode_over=self.is_over())
    return self.observe()
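# Aside (usage sketch under assumptions): every act()/reset() above returns an
# Observation exposing reward, state, and is_episode_over, so a random-action
# rollout against any of these environments could look like this. `env` and
# `sample_action` are placeholders for an environment instance and an
# action-sampling callable.
def random_rollout(env, sample_action, max_steps=200):
    observation = env.reset()
    total_reward = observation.reward
    for _ in range(max_steps):
        if observation.is_episode_over:
            break
        observation = env.act(sample_action())
        total_reward += observation.reward
    return total_reward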
def act(self, action):
    s = self.state
    torque = self.AVAIL_TORQUE[action]
    # Add noise to the applied torque.
    if self.torque_noise_max > 0:
        torque += np.random.uniform(-self.torque_noise_max,
                                    self.torque_noise_max)
    # Augment the state with the torque so it can be passed to _dsdt.
    s_augmented = np.append(s, torque)
    ns = rk4(self._dsdt, s_augmented, [0, self.dt])
    # Only care about the final timestep of integration returned by the
    # integrator.
    ns = ns[-1]
    ns = ns[:4]  # omit the action
    # ODEINT IS TOO SLOW!
    # ns_continuous = integrate.odeint(self._dsdt, self.s_continuous, [0, self.dt])
    # We only care about the state at the ''final timestep'', self.dt:
    # self.s_continuous = ns_continuous[-1]
    ns[0] = wrap(ns[0], -pi, pi)
    ns[1] = wrap(ns[1], -pi, pi)
    ns[2] = bound(ns[2], -self.MAX_VEL_1, self.MAX_VEL_1)
    ns[3] = bound(ns[3], -self.MAX_VEL_2, self.MAX_VEL_2)
    self.state = ns
    self.steps_elapsed += 1
    is_over = self.is_over()
    self.reward = -1. if not is_over else 0.
    self.observation = Observation(
        reward=self.reward,
        state=np.array([
            cos(ns[0]), sin(ns[0]),
            cos(ns[1]), sin(ns[1]),
            ns[2], ns[3]
        ]),
        is_episode_over=is_over)
    return self.observe()
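# Aside (illustrative sketch, assuming rk4 follows the usual fixed-step
# convention from the classic Acrobot code: derivs(y, t) -> dy/dt, and
# rk4(derivs, y0, t) returns one state row per time point in t):
import numpy as np

def rk4_sketch(derivs, y0, t):
    y = np.zeros((len(t), len(y0)))
    y[0] = np.asarray(y0, dtype=float)
    for i in range(len(t) - 1):
        h = t[i + 1] - t[i]
        k1 = np.asarray(derivs(y[i], t[i]))
        k2 = np.asarray(derivs(y[i] + 0.5 * h * k1, t[i] + 0.5 * h))
        k3 = np.asarray(derivs(y[i] + 0.5 * h * k2, t[i] + 0.5 * h))
        k4 = np.asarray(derivs(y[i] + h * k3, t[i + 1]))
        y[i + 1] = y[i] + (h / 6.0) * (k1 + 2 * k2 + 2 * k3 + k4)
    return y

# e.g. integrating dy/dt = -y over [0, 0.2]:
#   rk4_sketch(lambda y, t: -y, [1.0], [0.0, 0.2])[-1] ~ [exp(-0.2)]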