def add_episode(exp_buffer):
    """Fill a fresh episode on *exp_buffer* with two mid-trajectory steps and
    a terminal step, close it, and return the three steps in order."""
    episode = exp_buffer.create_episode()
    first = data.Step(np.random.rand(80, 80), 1, 0.0, False)
    second = data.Step(np.random.rand(80, 80), 0, 0.0, False)
    terminal = data.Step(np.random.rand(80, 80), 1, 0.0, True)
    for step in (first, second, terminal):
        episode.append(step)
    episode.end()
    return first, second, terminal
def test_buffered():
    """BufferedRolloutDataset assigns the first step of a 7+1 step episode a
    discounted advantage of roughly 0.99**7 ~= 0.932."""
    observation = np.array([0.0, 1.0, 2.0])
    mid_step = data.Step(observation, 1, 0.0, False)
    terminal_step = data.Step(observation, 1, 1.0, True)
    trajectory = [mid_step] * 7 + [terminal_step]

    dataset = data.BufferedRolloutDataset(discount_factor=0.99)
    for step in trajectory:
        dataset.append(step.observation, step.action, step.reward, step.done)

    # Terminal reward of 1.0 discounted back 7 steps: 0.99 ** 7 ~= 0.932.
    assert abs(dataset.rollouts[0].advantage - 0.932) <= 0.01
def test_epi_batching(db):
    """Episode appends with batch=10 are buffered: nothing is visible until
    the 10th append, and end() flushes a partially filled buffer.

    Fix: ``np.float`` was deprecated in NumPy 1.20 and removed in 1.24; use
    the concrete ``np.float64`` dtype instead.
    """
    env_config = configs.BaseConfig(
        'test',
        data.StepCoder(data.NumpyCoder(num_axes=2, dtype=np.float64)))
    rollout = db.create_rollout(env_config)

    episode = rollout.create_episode()
    for _ in range(3):
        o1 = np.random.rand(80, 80)
        episode.append(data.Step(o1, 1, 1.0, False), batch=10)
        o2 = np.random.rand(80, 80)
        episode.append(data.Step(o2, 1, 1.0, False), batch=10)
        o3 = np.random.rand(80, 80)
        episode.append(data.Step(o3, 1, 1.0, False), batch=10)
    # 9 steps are buffered but none have been written yet.
    assert len(episode) == 0
    # The 10th append fills the batch and triggers the write.
    episode.append(data.Step(o3, 1, 1.0, False), batch=10)
    assert len(episode) == 10

    episode = rollout.create_episode()
    for _ in range(3):
        o1 = np.random.rand(80, 80)
        episode.append(data.Step(o1, 1, 1.0, False), batch=10)
        o2 = np.random.rand(80, 80)
        episode.append(data.Step(o2, 1, 1.0, False), batch=10)
        o3 = np.random.rand(80, 80)
        episode.append(data.Step(o3, 1, 1.0, False), batch=10)
    # end() flushes the 9 buffered steps even though the batch is not full.
    episode.end()
    assert len(episode) == 9
def test_redis_write_step(db):
    """Three 3-step episodes round-trip through the rollout store and every
    step is readable by flat index after finalize().

    Fixes: ``np.float`` was removed in NumPy 1.24 (use ``np.float64``); the
    original generated ``o2``/``o3`` but never appended or checked them, so
    distinct per-episode observations were never actually verified — each
    episode now gets its own observation, asserted on read-back (matching
    the pattern used by ``test_epi_iter``). The unused ``lookup`` dict is
    removed.
    """
    env_config = configs.BaseConfig(
        'test',
        data.StepCoder(data.NumpyCoder(num_axes=2, dtype=np.float64)))
    rollout = db.create_rollout(env_config)

    # One distinct observation per episode, reused for all 3 of its steps.
    observations = []
    for _ in range(3):
        episode = rollout.create_episode()
        o = np.random.rand(80, 80)
        observations.append(o)
        for _ in range(3):
            episode.append(data.Step(o, 1, 1.0, False))
        episode.end()
    rollout.finalize()

    assert len(rollout) == 9
    for episode_index, o in enumerate(observations):
        for step_index in range(3):
            step = rollout[episode_index * 3 + step_index]
            np.testing.assert_array_equal(step.observation, o)
            assert step.reward == 1.0
            assert step.action == 1
def test_step_iter(db):
    """Iterating a finished episode yields its steps in append order.

    Fix: ``np.float`` was deprecated in NumPy 1.20 and removed in 1.24; use
    the concrete ``np.float64`` dtype instead.
    """
    env_config = configs.BaseConfig(
        'test',
        data.StepCoder(data.NumpyCoder(num_axes=2, dtype=np.float64)))
    rollout = db.create_rollout(env_config)
    episode = rollout.create_episode()

    observations = []
    for _ in range(3):
        o = np.random.rand(80, 80)
        observations.append(o)
        episode.append(data.Step(o, 1, 1.0, False))
    episode.end()

    assert len(episode) == 3
    for step, o in zip(episode, observations):
        np.testing.assert_array_equal(step.observation, o)
def test_encode_decode():
    """A Step survives a StepCoder encode/decode round trip unchanged."""
    observation = np.random.rand(80, 80)
    original = data.Step(observation, 1, 1.0, False)
    coder = data.StepCoder(observation_coder=data.NumpyCoder(2, np.float64))

    restored = coder.decode(coder.encode(original))

    assert restored.action == original.action
    assert restored.reward == original.reward
    assert restored.done == original.done
    np.testing.assert_array_equal(restored.observation, original.observation)
def populate():
    """Build a BufferedRolloutDataset containing the same 8-step trajectory
    appended under two episode keys ('player1' and 'player2')."""
    observation = np.array([0.0, 1.0, 2.0])
    mid_step = data.Step(observation, 1, 0.0, False)
    terminal_step = data.Step(observation, 1, 1.0, True)
    trajectory = [mid_step] * 7 + [terminal_step]

    dataset = data.BufferedRolloutDataset(discount_factor=0.99)
    for step in trajectory:
        # Each step is recorded once per player episode, in the same order.
        for player in ('player1', 'player2'):
            dataset.append(step.observation, step.action, step.reward,
                           step.done, episode=player)
    return dataset
def test_epi_iter(db):
    """Iterating a finalized rollout yields its episodes, and each episode
    yields its own steps in append order."""
    env_config = configs.BaseConfig(
        'test',
        StepCoder(observation_coder=NumpyCoder(2, dtype=np.float64)))
    rollout = db.create_rollout(env_config)

    # Remember which observations went into which episode, keyed by id.
    lookup = {}
    for _ in range(3):
        episode = rollout.create_episode()
        observations = []
        for _ in range(3):
            o = np.random.rand(80, 80)
            episode.append(data.Step(o, 1, 1.0, False))
            observations.append(o)
        lookup[episode.id] = observations
        episode.end()
    rollout.finalize()

    assert len(rollout) == 9
    for episode in rollout:
        assert len(episode) == 3
        for step, o in zip(episode, lookup[episode.id]):
            np.testing.assert_array_equal(step.observation, o)
def test_epi_total_reward(db):
    """total_reward() tracks the running sum of appended step rewards.

    Fix: ``np.float`` was deprecated in NumPy 1.20 and removed in 1.24; use
    the concrete ``np.float64`` dtype instead.
    """
    env_config = configs.BaseConfig(
        'test',
        data.StepCoder(data.NumpyCoder(num_axes=2, dtype=np.float64)))
    rollout = db.create_rollout(env_config)
    episode = rollout.create_episode()

    # Empty episode has zero reward; each 1.0-reward step bumps the total.
    assert episode.total_reward() == 0
    for expected_total in (1.0, 2.0, 3.0):
        episode.append(data.Step(np.random.rand(80, 80), 1, 1.0, False))
        assert episode.total_reward() == expected_total
def test_concurrency(db):
    """Writing new episodes to a rollout while a SARAdvantageDataset built
    from it is being consumed by a DataLoader must not break the read pass."""
    config = configs.DiscreteConfig(gym_env_string='test',
                                    features=(80, 80),
                                    features_dtype=np.float64,
                                    action_map=[0, 1])
    rollout = db.create_rollout(config)
    action = config.default_action
    obs = []

    def write_episode(target):
        # One 3-step episode: two zero-reward steps, one reward step,
        # then the terminal sentinel carrying the last observation.
        episode = target.create_episode()
        first = np.random.rand(80, 80)
        episode.append(data.Step(first, action, 0.0, False))
        second = np.random.rand(80, 80)
        episode.append(data.Step(second, action, 0.0, False))
        third = np.random.rand(80, 80)
        episode.append(data.Step(third, action, 1.0, False))
        # sentinel containing last observation
        episode.append(data.Step(third, action, 0.0, True))
        episode.end()
        return [first, second, third]

    obs.append(write_episode(rollout))
    obs.append(write_episode(rollout))

    rollout = db.latest_rollout(config)
    dataset = data.SARAdvantageDataset(
        rollout, discount_factor=0.99, state_transform=config.transform,
        action_transform=config.action_transform)

    # Keep writing after the dataset has been constructed.
    obs.append(write_episode(rollout))

    loader = DataLoader(dataset, batch_size=65, shuffle=True)
    for minibatch in loader:
        pass
def test_advantage(db):
    """SARAdvantageDataset standardizes per-step discounted returns into
    advantages and maps raw actions through the config's action transform."""
    config = configs.DiscreteConfig(gym_env_string='test',
                                    features=(80, 80),
                                    features_dtype=np.float64,
                                    action_map=[0, 1, 6])
    config.discount_factor = 0.99
    rollout = db.create_rollout(config)
    default_action = config.default_action

    obs = []
    episode = rollout.create_episode()
    o1 = np.random.rand(80, 80)
    episode.append(data.Step(o1, default_action, 0.0, False))
    o2 = np.random.rand(80, 80)
    episode.append(data.Step(o2, 6, 0.0, False))
    o3 = np.random.rand(80, 80)
    episode.append(data.Step(o3, 1, 1.0, False))
    # sentinel containing last observation
    episode.append(data.Step(o3, default_action, 0.0, True))
    obs.append([o1, o2, o3])
    episode.end()

    rollout_id = rollout.id
    rollout = db.latest_rollout(config)
    assert rollout.id == rollout_id

    dataset = data.SARAdvantageDataset(
        rollout, discount_factor=0.99, state_transform=config.transform,
        action_transform=config.action_transform)

    # Discounted returns for the three real steps, standardized to form
    # the expected advantages.
    returns = [1.0 * config.discount_factor ** 2,
               1.0 * config.discount_factor,
               1.0]
    mu = mean(returns)
    sigma = stdev(returns)
    expected = [(r - mu) / (sigma + 1e-12) for r in returns]

    assert len(dataset) == 3
    observation, action, reward, advantage = dataset[0]
    assert reward == 0.00
    assert advantage == expected[0]
    assert action == config.default_action
    observation, action, reward, advantage = dataset[1]
    assert reward == 0.0
    assert advantage == expected[1]
    assert action.item() == 2
    observation, action, reward, advantage = dataset[2]
    assert reward == 1.0
    assert advantage == expected[2]
    assert action.item() == 1
def add_dud_episode(exp_buffer):
    """Append a degenerate episode to *exp_buffer*: a single step that is
    terminal immediately, with no reward."""
    episode = exp_buffer.create_episode()
    observation = np.random.rand(80, 80)
    episode.append(data.Step(observation, 0, 0.0, True))
    episode.end()
def test_advantage_rollout_dataset(db):
    """RolloutDatasetBase computes standardized discounted-return advantages
    across two identical 3-step episodes.

    Fix: renamed from ``test_advantage`` — a function of that name is already
    defined earlier in this module, so the later definition shadowed the
    earlier one and pytest only collected/ran one of the two tests.
    """
    env_config = configs.BaseConfig(
        'test_data',
        StepCoder(observation_coder=NumpyCoder(2, dtype=np.float64)))
    rollout = db.create_rollout(env_config)

    obs = []
    for _ in range(2):
        episode = rollout.create_episode()
        o1 = np.random.rand(80, 80)
        episode.append(data.Step(o1, 1, 0.0, False))
        o2 = np.random.rand(80, 80)
        episode.append(data.Step(o2, 1, 0.0, False))
        o3 = np.random.rand(80, 80)
        episode.append(data.Step(o3, 1, 1.0, False))
        obs.append([o1, o2, o3])
        episode.end()
    rollout.finalize()

    dataset = data.RolloutDatasetBase(env_config, rollout)

    # Per-step discounted returns of each episode, standardized over all 6.
    returns = [1.0 * env_config.discount_factor ** 2,
               1.0 * env_config.discount_factor,
               1.0] * 2
    mu = mean(returns)
    sigma = stdev(returns)
    adv = [(r - mu) / (sigma + 1e-12) for r in returns]

    expected_rewards = [0.0, 0.0, 1.0, 0.0, 0.0, 1.0]
    for index, (expected_reward, expected_adv) in enumerate(
            zip(expected_rewards, adv)):
        observation, action, reward, advantage = dataset[index]
        assert reward == expected_reward
        assert advantage == expected_adv