Example #1
    def run(self):
        """
        Run the agent for a couple of episodes and record video to ./video
        """
        from gym.wrappers import Monitor
        env = Monitor(self.env, './video', force=True)
        state = env.reset()
        reward_sum = 0
        episode_number = 0
        while episode_number < 2:
            # forward the policy network and sample an action from the returned probability
            aprob, h = policy_forward(state)

            # sample one of the two actions from a Bernoulli distribution
            action = 0 if np.random.uniform() < aprob else 1

            # step the environment and get new measurements
            state, reward, done, info = env.step(action)
            reward_sum += reward
            env.render()

            if done:  # an episode finished
                episode_number += 1
                print("Episode finished with total reward", reward_sum)
                reward_sum = 0
                state = env.reset()  # reset env
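Example #1 calls policy_forward without defining it; in Karpathy-style policy-gradient scripts it is a tiny two-layer network. A minimal sketch of what it is assumed to compute (the model dict and layer sizes below are hypothetical):

import numpy as np

model = {
    'W1': np.random.randn(200, 4) / np.sqrt(4),   # hidden weights (sizes assumed)
    'W2': np.random.randn(200) / np.sqrt(200),    # output weights
}

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def policy_forward(x):
    # returns the probability of taking action 0 and the hidden activations
    h = np.dot(model['W1'], x)
    h[h < 0] = 0  # ReLU
    logp = np.dot(model['W2'], h)
    return sigmoid(logp), h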
Example #2
def run_video_agent(model, eps=500):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = model.to(device)  # keep the policy on the same device as the observations
    env = gym.make('BipedalWalker-v3')
    env = Monitor(env,
                  './vid',
                  video_callable=lambda episode_id: True,
                  force=True)
    obs = env.reset()
    last_obs = obs

    fitness = 0.0

    for _ in range(eps):  # 'eps' here acts as a per-episode step budget, not an episode count
        env.render()

        obs = torch.from_numpy(obs).float().to(device)
        action = (model(obs).detach()).cpu().numpy()
        new_obs, reward, done, info = env.step(action)
        fitness += reward
        obs = new_obs

        if done:
            break

    env.close()
    print("Best score ", fitness)
Example #3
def simulate(env,
             agent,
             deterministic=True,
             num_episodes=3,
             render=True,
             wait_after_render=1e-3,
             render_kwargs=None,
             record_video=False):
    render_kwargs = render_kwargs or dict()

    assert env.max_episode_steps > 0
    if record_video:
        env = Monitor(env, directory='./data')

    episode_info = []
    for _ in range(num_episodes):
        obs = env.reset()
        agent.reset()
        done = False
        episode_return = 0
        t = 0
        while not done:
            if render:
                env.render(**render_kwargs)
                time.sleep(wait_after_render)

            with torch.no_grad():
                action = agent.act(obs, deterministic)
            obs, reward, done, _ = env.step(action)
            episode_return += reward
            t += 1
        episode_info.append((t, episode_return))

    return episode_info
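A usage sketch for simulate(): it expects an env exposing max_episode_steps and an agent with reset()/act(). RandomAgent and the attribute shim below are hypothetical glue, not part of the original code:

import gym

class RandomAgent:
    def __init__(self, action_space):
        self.action_space = action_space

    def reset(self):
        pass

    def act(self, obs, deterministic=False):
        return self.action_space.sample()

env = gym.make('CartPole-v0')
env.max_episode_steps = env.spec.max_episode_steps  # satisfy the assert in simulate()
print(simulate(env, RandomAgent(env.action_space), render=False))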
Example #4
def evaluate(agent, env, n_episodes=5, render=False, record=False):
    total_rewards = []

    if record:
        env = Monitor(env, './videos/', force=True)

    for episode in range(n_episodes):

        obs = env.reset()
        obs = obs_reshape(obs)
        total_reward = 0.0
        episode_length = 0

        done = False
        while not done:
            action = agent.act(obs.reshape(1, *obs.shape))
            next_obs, reward, done, _ = env.step(action[0])
            next_obs = obs_reshape(next_obs)
            obs = next_obs
            
            total_reward += reward
            episode_length += 1

            if render:
                env.render()                
                
        total_rewards.append(total_reward)
        
#         print(f">> episode = {episode + 1} / {n_episodes}, total_reward = {total_reward:10.4f}, episode_length = {episode_length}")
        
    if render:
        env.close()

    return np.mean(total_rewards)
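obs_reshape is not shown in Example #4; a plausible stand-in for image observations (purely an assumption about what it does) would cast frames to float32 and scale them to [0, 1]:

import numpy as np

def obs_reshape(obs):
    # hypothetical helper: cast to float32 and normalise pixel values
    return np.asarray(obs, dtype=np.float32) / 255.0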
Example #5
def play(agent_dir, num_episodes, max_episode_steps, save_videos):
    agent = get_agent(gin.query_parameter("train.agent"))(make_env_fn(
        gin.query_parameter("train.env_id"),
        episode_time_limit=max_episode_steps))
    agent.pretrain_setup(gin.query_parameter("train.total_timesteps"))

    ckpt_path = tf.train.latest_checkpoint(
        os.path.join(agent_dir, "best-weights"))
    checkpoint = tf.train.Checkpoint(agent)
    checkpoint.restore(
        ckpt_path).assert_existing_objects_matched().expect_partial()

    env = agent.make_env()

    if save_videos:
        env = Monitor(
            env,
            os.path.join(agent_dir, "monitor"),
            video_callable=lambda _: True,
            force=True,
        )

    try:
        episodes = 0
        obs = env.reset()
        while episodes < num_episodes:
            action = agent.act(np.expand_dims(obs, 0),
                               deterministic=True).numpy()
            obs, _, done, _ = env.step(action[0])
            env.render()
            if done:
                obs = env.reset()
                episodes += 1
    except KeyboardInterrupt:
        pass
    finally:
        env.close()
Example #6
def test(model, args, verbose=True):
    # Initialize environment and model
    env = Monitor(gym.make(args.env), './recordings', force=True)
    model.eval()

    # Initialize variables
    done, ep_reward = False, []
    s = env.reset()
    hx, cx = init_hidden(1, args.size_hidden)

    # Generate rollout
    while not done:  # and step < env.spec.timestep_limit:

        # Render if enabled
        if args.render: env.render()

        # Take a step in environment
        logit, _, _, _ = model.forward(s, hx, cx)
        prob = F.softmax(logit, dim=-1)
        action = prob.multinomial(1).data
        s, r, done, _ = env.step(action.squeeze().numpy())
        ep_reward.append(r)

        if done: break

    # Close environment and show performance
    env.close()
    if verbose:
        print('Test agent achieved a reward of', np.sum(ep_reward))
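init_hidden is referenced but not defined in this snippet; the usual pattern for an LSTM policy is to start from zeroed hidden and cell states, as in this assumed sketch:

import torch

def init_hidden(batch_size, hidden_size):
    # zero-initialised LSTM hidden state (hx) and cell state (cx)
    hx = torch.zeros(batch_size, hidden_size)
    cx = torch.zeros(batch_size, hidden_size)
    return hx, cx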
Example #7
def enjoy(policy,
          env,
          save_path=None,
          save_video=False,
          obs_fn=None,
          nepochs=100):
    """
        Run the policy and, if save_video is set, record the rollouts with the Monitor class.
    """
    if save_video:
        assert save_path is not None, 'A path to save videos must be provided!'
    policy.cuda()
    policy.eval()
    if save_video:
        env = Monitor(env, directory=save_path)

    for _ in range(nepochs):
        done = False
        obs = env.reset()
        episode_rwd = 0
        while not done:
            env.render()
            if obs_fn is not None:
                obs = obs_fn(obs)
            obs = Variable(torch.from_numpy(obs[np.newaxis])).float().cuda()
            value, action, logprob, mean = policy(obs)
            action = action.data[0].cpu().numpy()
            obs, reward, done, _ = env.step(action)
            episode_rwd += reward
        print('Episode reward is', episode_rwd)
Example #8
def main():
    """
    You can test your game when you finish setting up your environment.
    Input range from 0 to 5:
        0 : South (Down)
        1 : North (Up)
        2 : East (Right)
        3 : West (Left)
        4 : Pick up
        5 : Drop off
    """

    GAME = "Assignment1-Taxi-v2"
    env = gym.make(GAME)
    n_state = env.observation_space.n
    n_action = env.action_space.n
    env = Monitor(env, "taxi_simple", force=True)

    s = env.reset()
    steps = 100
    for step in range(steps):
        env.render()
        action = int(input("Please type in the next action:"))
        s, r, done, info = env.step(action)
        print(s)
        print(r)
        print(done)
        print(info)
        if done:
            s = env.reset()

    # close environment and monitor
    env.close()
Example #9
class Simulation():
    def __init__(self, environment="CartPole-v0", save_every=5):
        env = gym.make(environment)
        self.env = Monitor(
            env,
            './video',
            video_callable=lambda episode_no: episode_no % save_every == 0,
            force=True)
        if environment == "Pong-v0":
            self.env = wrap_deepmind(env, frame_stack=True, scale=True)
        self.environment = environment
        #self.env.seed(0)

    def reset(self):
        observation = self.env.reset()
        if self.environment == "Pong-v0":
            observation = torch.from_numpy(np.stack(observation)).transpose_(
                0, 2).transpose_(1, 2).float().unsqueeze(0)
        else:
            observation = torch.from_numpy(observation).float().unsqueeze(0)
        return observation

    def step(self, action):
        observation, reward, is_done, info = self.env.step(action)
        if self.environment == "Pong-v0":
            observation = torch.from_numpy(np.stack(observation)).transpose_(
                0, 2).transpose_(1, 2).float().unsqueeze(0)
        else:
            observation = torch.from_numpy(observation).float().unsqueeze(0)
        return observation, reward, is_done, info

    def render(self):
        self.env.render()

    def close(self):
        self.env.close()
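A hypothetical driver for the Simulation wrapper with its CartPole-v0 default (the Monitor records every save_every-th episode to ./video):

sim = Simulation()
obs = sim.reset()
done = False
while not done:
    obs, reward, done, info = sim.step(sim.env.action_space.sample())
sim.close()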
Example #10
class Environment(object):
    def __init__(self, game, record=False, width=84, height=84, seed=0):
        self.game = gym.make(game)
        self.game.seed(seed)

        if record:
            self.game = Monitor(self.game, './video', force=True)

        self.width = width
        self.height = height
        self._toTensor = T.Compose([T.ToPILImage(), T.ToTensor()])
        gym_ple  # presumably referenced so the gym_ple import (which registers the PLE envs) is not flagged as unused

    def play_sample(self, mode: str = 'human'):
        observation = self.game.reset()

        while True:
            screen = self.game.render(mode=mode)
            if mode == 'rgb_array':
                screen = self.preprocess(screen)
            action = self.game.action_space.sample()
            observation, reward, done, info = self.game.step(action)
            if done:
                break
        self.game.close()

    def preprocess(self, screen):
        preprocessed: np.array = cv2.resize(screen, (self.height, self.width))  # resize to 84x84
        preprocessed = np.dot(preprocessed[..., :3], [0.299, 0.587, 0.114])  # convert to grayscale
        # preprocessed: np.array = preprocessed.transpose((2, 0, 1))  # reorder to (C, W, H)
        preprocessed: np.array = preprocessed.astype('float32') / 255.

        return preprocessed

    def init(self):
        """
        @return observation
        """
        return self.game.reset()

    def get_screen(self):
        screen = self.game.render('rgb_array')
        screen = self.preprocess(screen)
        return screen

    def step(self, action: int):
        observation, reward, done, info = self.game.step(action)
        return observation, reward, done, info

    def reset(self):
        """
        :return: observation array
        """
        observation = self.game.reset()
        observation = self.preprocess(observation)
        return observation

    @property
    def action_space(self):
        return self.game.action_space.n
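Hypothetical usage of the Environment class: gym_ple registers PLE games such as 'FlappyBird-v0' (assuming gym_ple and its dependencies are installed), and get_screen() returns the preprocessed 84x84 grayscale frame:

env = Environment('FlappyBird-v0', record=False)
obs = env.init()
frame = env.get_screen()
print(frame.shape, env.action_space)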
Example #11
def cart_pole_1():

    env = gym.make('CartPole-v0')
    # print('[cart_pole_1]', env.action_space)                    # Discrete(2)
    # print('[cart_pole_1]', env.observation_space)               # Box(4,)
    # # The action is a non-negative integer (0 or 1). Box denotes an n-dimensional box, so the observation is a 4-dimensional array. We can inspect the box's upper and lower bounds below.
    # print('[cart_pole_1]', env.observation_space.high)          # [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]
    # print('[cart_pole_1]', env.observation_space.low)           # [-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]

    env = Monitor(env=env,
                  directory='./tmp/cartpole-experiment-0202',
                  video_callable=False,
                  write_upon_reset=True)

    observation = env.reset()  # reset the environment's state and return the initial observation

    for t in range(100):
        env.render()  # redraw one frame of the environment
        print('[cart_pole_1] observation old:', observation)
        action = env.action_space.sample()
        # action = t % 2
        print('[cart_pole_1] action', action)
        observation, reward, done, info = env.step(
            action)  # advance one timestep; returns observation, reward, done, info
        print('[cart_pole_1] observation new:', observation,
              '[reward, done, info]:', reward, done, info)

        if done:
            print("[observation] Done after {} time steps".format(t + 1))
            break

    env.close()
Example #12
def play(N=1000):

    # Change this to 'AssaultNoFrameskip-v4' to play the second game
    env = wrap_atari_deepmind('BreakoutNoFrameskip-v4', False)
    env = Monitor(env, directory + "/", force=True)
    agent.copy(DQN_online[4], sess_o)
    tot_reward = []
    episode = 1
    i = 0
    while i < N:

        r = 0
        s = env.reset()
        terminal = False
        episode_reward = 0
        while not terminal:

            env.render()
            a = agent.get_action(agent, env, np.array(s))
            s_next, r, terminal, dizi = env.step(a)
            episode_reward += r
            i = i + 1
            s = s_next
        tot_reward.append(episode_reward)
        print("Episode reward: ", episode_reward)
        episode = episode + 1
    env.close()
Example #13
def TestDQNAgent(sess,
                 env,
                 q_value_estimator,
                 state_preprocessor,
                 num_episodes,
                 experiment_dir,
                 record_steps=1):

    EpisodeStats = namedtuple('Stats', ['episode_lengths', 'episode_rewards'])
    stats = EpisodeStats(episode_lengths=np.zeros(num_episodes), episode_rewards=np.zeros(num_episodes))

    ckpt_dir = os.path.join(experiment_dir, 'checkpoints')
    record_path = os.path.join(experiment_dir, 'record/tests/')

    if not os.path.exists(record_path):
        os.makedirs(record_path)

    saver = tf.train.Saver()
    latest_checkpoint = tf.train.latest_checkpoint(ckpt_dir)
    if latest_checkpoint:
        print('\nLoading model checkpoint {}...'.format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)

    total_t = sess.run(tf.contrib.framework.get_global_step())
    epsilon = 0.1
    policy = make_epsilon_greedy_policy(q_value_estimator, len(VALID_ACTIONS))

    env = Monitor(env, directory=record_path, video_callable=lambda count: count % record_steps == 0, resume=True)
    for i_episode in range(num_episodes):
        state = env.reset()
        state = state_preprocessor.process(sess, state)
        state = np.stack([state] * 4, axis=2)

        for t in itertools.count():
            env.render()

            print("\rStep {} ({}) | Episode {}/{}".format(t, total_t, i_episode + 1, num_episodes), end="")
            sys.stdout.flush()

            action_probs = policy(sess, state, epsilon)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
            next_state = state_preprocessor.process(sess, next_state)
            next_state = np.append(state[:, :, 1:], np.expand_dims(next_state, 2), axis=2)

            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            if done:
                break

            state = next_state
            total_t += 1

        episode_stats = EpisodeStats(episode_lengths=stats.episode_lengths[:i_episode + 1],
                                     episode_rewards=stats.episode_rewards[:i_episode + 1])
        yield total_t, episode_stats

    return stats
Example #14
def main():
    finishedTraining = EPISODES
    startTime = time.time()
    env = filter_env.makeFilteredEnv(gym.make(ENV_NAME))
    results_file = open("ResultsNew.csv", 'a')
    agent = DDPG(env, results_file)
    env = Monitor(env, directory='experiments/' + ENV_NAME, force=True)
    results_file.write("Episodes Spent Training; " + str(TEST) +
                       " Episode Eval Avg \n")
    for episode in range(EPISODES):
        state = env.reset()
        if (episode % 20 == 0):
            print("episode:", episode)
        # Train
        for step in range(env.spec.timestep_limit):
            action = agent.noise_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.perceive(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
        # Testing:
        if (episode + 1) % 100 == 0 and episode > 100:
            total_reward = 0
            for i in range(TEST):
                state = env.reset()
                for j in range(env.spec.timestep_limit):
                    env.render()
                    action = agent.action(state)  # direct action for test
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            ave_reward = total_reward / TEST
            print('episode: ', episode, 'Evaluation Average Reward:',
                  ave_reward)
            results_file.write(str(episode) + "; " + str(ave_reward) + "\n")
            if ave_reward > 800 and finishedTraining > episode + 300:
                finishedTraining = episode + 300
            elif (episode >= finishedTraining):
                break

    results_file.write("Time Training (" + str(EPISODES) + "episodes);" +
                       str(time.time() - startTime) + "\n")
    results_file.write("Evaluation Episode; Reward \n")
    for episode in range(100):
        total_reward = 0
        env.reset()
        state = env.env.env.set_test(episode)
        for j in range(env.spec.timestep_limit):
            action = agent.action(state)  # direct action for test
            state, reward, done, _ = env.step(action)
            total_reward += reward
            if done:
                break
        results_file.write(str(episode) + "; " + str(total_reward) + "\n")
    results_file.write("endExperiment\n\n")
    results_file.close()
Example #15
def main():
    """ Orchestrates agent and environment interactions. """
    # Create environment
    environment = gym.make(ENVIRONMENT)
    if RECORD:
        environment = Monitor(env=environment,
                              directory=VIDEO_DIRECTORY,
                              video_callable=lambda episode_id: True,
                              force=True)
    # Set random seeds
    environment.seed(0)
    np.random.seed(0)
    # Get action and state space sizes
    action_space = environment.action_space.n
    state_space = environment.observation_space.shape[0]
    # Instantiate agent
    agent = Agent(action_space, state_space)
    # Load model weights
    if path.exists(CHECKPOINT_DIRECTORY):
        agent.load(CHECKPOINT_DIRECTORY)
    # Initialise list of all rewards
    rewards = []
    for episode in range(EPISODES):
        # Get initial state
        state = environment.reset()
        state = np.reshape(state, (1, state_space))
        # Reset score for this episode
        score = 0
        for _ in range(STEPS):
            if RENDER:
                environment.render()
            # Agent selects action from state
            action = agent.act(state)
            # Agent performs action and makes an observation of the environment
            next_state, reward, done, _ = agent.observe(environment, action)
            next_state = np.reshape(next_state, (1, state_space))
            observation = (state, action, reward, next_state, done)
            # Agent remembers parameters of this time step
            agent.remember(observation)
            state = next_state
            # Agent retrains model
            agent.learn()
            score += reward
            if done:
                print("Episode: {}/{}. Reward: {:.2f}".format(
                    episode + 1, EPISODES, score))
                break
        rewards.append(score)
        # Average reward over the last 100 episodes
        average_reward = np.mean(rewards[-100:])
        print("Average reward: {:.2f}\n".format(average_reward))
    # Terminate environment
    environment.close()
    # Save model
    agent.save(CHECKPOINT_DIRECTORY)
    # Display performance over time
    summary(rewards)
Example #16
def main(path, env_name, seed, render, n_test_rollouts=2):
    set_global_seeds(seed)

    # initialize environment
    env = gym.make(env_name)
    max_action = env.action_space.high

    # Load policy.
    save_recording = False

    if save_recording:
        saving_vid = ('/media/flowers/3C3C66F13C66A59C/data_save/gym_recording/'
                      'ddpg_cheetah_drop/' + weight_file[:-5])
        env = Monitor(env, saving_vid, force=True)
    # env.directory = '/media/flowers/3C3C66F13C66A59C/data_save/gym_recording/ddp_cheetah_drop'

    with tf.Session() as sess:
        # init = tf.global_variables_initializer()
        # sess.run(init)
        policy_file = glob.glob(path + '*.meta')[0]
        saver = tf.train.import_meta_graph(policy_file)
        saver.restore(sess, tf.train.latest_checkpoint(path))
        graph = tf.get_default_graph()
        obs0 = graph.get_tensor_by_name("obs0:0")
        actor_tf = graph.get_tensor_by_name("actor/Tanh:0")

        score = np.zeros([n_test_rollouts])
        successes = []
        for i in range(n_test_rollouts):
            done = False
            obs = env.reset()
            actions = []
            rewards = []
            observations = []
            while not done:
                inpt = obs
                feed_dict = {obs0: [inpt]}
                action = sess.run(actor_tf, feed_dict=feed_dict)
                actions.append(action)
                if render:
                    env.render()
                new_obs, r, done, info = env.step(action.flatten() *
                                                  max_action)
                observations.append(new_obs)
                rewards.append(r)
                obs = new_obs
            if 'is_success' in info.keys():
                successes.append(info['is_success'])
            score[i] = sum(rewards)
        success_rate = np.mean(successes)
        print('Success rate = %f' % success_rate)
        print(score.max())
        print(score.min())
Example #17
def run(episodes=1):
    env = gym.make('obstacle-v0')
    env = Monitor(env, 'out', force=True)

    for _ in range(episodes):
        env.reset()
        env.unwrapped.automatic_rendering_callback = env.video_recorder.capture_frame  # Capture in-between frames
        done = False
        while not done:
            action = env.unwrapped.dynamics.desired_action
            observation, reward, done, info = env.step(action)
            env.render()
    env.close()
Example #18
def test():
    env = gym.make(args.env)
    act = deepq.load(os.path.join(args.log_dir, args.log_fname))
    if args.record:
        env = Monitor(env, directory=args.log_dir)
    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            env.render(mode='test')
            obs, rew, done, _ = env.step(act(obs[None])[0])
            episode_rew += rew
        print("Episode reward", episode_rew)
Example #19
def main():
    # initialize OpenAI Gym env and dqn agent
    env = gym.make(ENV_NAME)
    agent = DQN(env)

    for episode in range(EPISODE):
        # initialize task
        state = env.reset()
        # Train
        for step in range(STEP):
            action = agent.egreedy_action(state)  # e-greedy action for train
            next_state, reward, done, _ = env.step(action)
            # Define a shaped reward for the agent
            reward_agent = -1 if done else 0.1
            agent.perceive(state, action, reward_agent, next_state, done)
            state = next_state
            if done:
                break
        # Test every 100 episodes
        if episode % 100 == 0:
            total_reward = 0
            for i in range(TEST):
                state = env.reset()
                for j in range(STEP):
                    env.render()
                    action = agent.action(state)  # direct action for test
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            ave_reward = total_reward / TEST
            print('episode: %d, Evaluation Average Reward: %f' %
                  (episode, ave_reward))
            # if ave_reward >= 200:
            # 	break

    # save results for uploading
    # env.monitor.start('gym_results/CartPole-v0-experiment-1',force = True)
    env = Monitor(env, 'gym_results/CartPole-v0-experiment-1', force=True)
    for i in range(100):
        state = env.reset()
        for j in range(200):
            env.render()
            action = agent.action(state)  # direct action for test
            state, reward, done, _ = env.step(action)
            total_reward += reward
            if done:
                break
    env.close()
Example #20
def validation(env):
    env = make_env(env)
    env = Monitor(env, './video', force=True)
    sonic = SonicAgent(env, TIMESTEPS_PER_EPISODE * EPISODES, True)
    sonic.load_model('sonic_model_final.h5')
    obs = env.reset()
    while True:
        action = sonic.policy(obs)
        #action = random.choice([a for a in range(env.action_space.n)])
        next_obs, reward, done, info = env.step(action)
        print("Para la accion #{} la recompensa es {}".format(action, reward))
        env.render()
        obs = next_obs
        if done:
            env.close()
            break
Example #21
def test():
    env = envs.make(args.env, render=bool(args.render), record=bool(args.record))
    act = simple.load(os.path.join(args.log_dir, args.log_fname))
    if args.record:
        env = Monitor(env, directory=args.log_dir)
    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            if args.render:
                env.render()
                time.sleep(0.05)
            obs, rew, done, _ = env.step(act(obs[None])[0])
            episode_rew += rew
        print("Episode reward", episode_rew)
Example #22
def test():
    from baselines0.deepq.utils import BatchInput
    import json
    learning_prop = json.load(
        open(os.path.join(args.log_dir, 'learning_prop.json'), 'r'))

    env = make_atari(args.env)
    env = models.wrap_atari_dqn(env)
    observation_space_shape = env.observation_space.shape

    def make_obs_ph(name):
        return BatchInput(observation_space_shape, name=name)

    model = models.cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[learning_prop['num_units']] * learning_prop['num_layers'],
        dueling=bool(args.dueling),
        init_mean=args.init_mean,
        init_sd=args.init_sd,
    )

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': model,
        'scope': learning_prop['scope'],
        'eps': args.test_eps
    }
    act = simple.load(os.path.join(args.log_dir, args.log_fname), act_params)
    if args.record:
        env = Monitor(env, directory=args.log_dir)
    episode_rew = 0
    t = 0
    while True:
        obs, done = env.reset(), False
        while (not done):
            if args.render:
                env.render()
                time.sleep(0.05)
            obs, rew, done, info = env.step(act(obs[None])[0])
            # Reset only the environment, not the recorder
            if args.record and done:
                obs, done = env.env.reset(), False
            episode_rew += rew
            t += 1
        if info['ale.lives'] == 0:
            print("Episode reward %.2f after %d steps" % (episode_rew, t))
            episode_rew = 0
            t = 0
Example #23
def record_play(model, env):
    env = Monitor(env, './video', force=True)
    total_reward = 0
    state = env.reset()

    while True:
        action = model.choose_action(state)
        next_state, reward, done, _ = env.step(action)

        env.render()
        time.sleep(0.03)

        total_reward += reward
        state = next_state

        if done:
            return total_reward
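record_play only needs an object with a choose_action(state) method; a toy heuristic policy for CartPole-v0 (hypothetical, just to exercise the recorder) could look like this:

import gym

class LeanFollower:
    def choose_action(self, state):
        # push the cart in the direction the pole is leaning
        return 0 if state[2] < 0 else 1

score = record_play(LeanFollower(), gym.make('CartPole-v0'))
print('Recorded episode return:', score)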
Example #24
def test():
    env = make_atari(args.env)
    env = deepq.wrap_atari_dqn(env)
    act = deepq.load(os.path.join(args.log_dir, args.log_fname))
    if args.record:
        env = Monitor(env, directory=args.log_dir)

    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        t = 0
        while not done:
            if not args.record:
                env.render()
            obs, rew, done, _ = env.step(act(obs[None])[0])
            episode_rew += rew
            t += 1
        print("Episode reward %.2f after %d steps"%(episode_rew, t))
Example #25
def visualize(env, net_params):
    print('Testing....\n')
    display = Display(visible=0, size=(1400, 900))
    env = Monitor(env, './video', force=True)
    nn = create_nn(net_params)

    display.start()
    state = env.reset()

    for _ in range(CONFIG['ep_max_steps']):
        env.render()
        action = get_action(nn, state)
        state, _, done, _ = env.step(action)
        if done: break
    else:
        # for/else: the step budget ran out without `done`, so finalize the Monitor's stats recorder manually
        env.stats_recorder.save_complete()
        env.stats_recorder.done = True

    env.close()
    display.stop()
Example #26
def recording(recording_env, recording_agent, weight_name):
    env = Monitor(recording_env, './video_%s'%(weight_name), force=True)
    # watch a trained agent
    window = []
    n_episodes = 10
    for _ in range(n_episodes):
        # add recording trigger
        state = env.reset()
        total_score = 0
        for j in range(MAX_STEP):
            state = state/255.0
            action, _, _ = recording_agent.act(state, test=True)
            env.render()
            state, reward, done, _ = env.step(action)
            total_score += reward
            if done:
                break
        window.append(total_score)
        print('Total score for this episode {:.4f}'.format(total_score))
    print('Avg score {}'.format(np.mean(window)))
Example #27
def main():

    GAME = "Assignment1-Taxi-v2"
    env = gym.make(GAME)
    n_state = env.observation_space.n
    n_action = env.action_space.n
    env = Monitor(env, "taxi_simple", force=True)

    s = env.reset()
    steps = 100
    for step in range(steps):
        env.render()
        action = int(input("Please type in the next action:"))
        s, r, done, info = env.step(action)
        print(s)
        print(r)
        print(done)
        print(info)
        if done:
            s = env.reset()

    env.close()
Example #28
def main():
    env = gym.make(ENV_NAME)
    agent = Agent(num_actions=env.action_space.n)

    if TRAIN:  # Train mode
        for _ in range(NUM_EPISODES):
            terminal = False
            observation = env.reset()
            for _ in range(random.randint(1, NO_OP_STEPS)):
                last_observation = observation
                observation, _, _, _ = env.step(0)  # Do nothing
            state = agent.get_initial_state(observation, last_observation)
            while not terminal:
                last_observation = observation
                action = agent.get_action(state)
                observation, reward, terminal, _ = env.step(action)
                # env.render()
                processed_observation = preprocess(observation,
                                                   last_observation)
                state = agent.run(state, action, reward, terminal,
                                  processed_observation)
    else:  # Test mode
        # env.monitor.start(ENV_NAME + '-test')
        env = Monitor(env, './SpaceInvaders-1', force=True)
        for _ in range(NUM_EPISODES_AT_TEST):
            terminal = False
            observation = env.reset()
            for _ in range(random.randint(1, NO_OP_STEPS)):
                last_observation = observation
                observation, _, _, _ = env.step(0)  # Do nothing
            state = agent.get_initial_state(observation, last_observation)
            while not terminal:
                last_observation = observation
                action = agent.get_action_at_test(state)
                observation, _, terminal, _ = env.step(action)
                env.render()
                processed_observation = preprocess(observation,
                                                   last_observation)
                state = np.append(state[1:, :, :],
                                  processed_observation,
                                  axis=0)
Example #29
def test():
    from baselines0.deepq.utils import BatchInput

    env = make_atari(args.env)
    env = deepq.wrap_atari_dqn(env)
    observation_space_shape = env.observation_space.shape

    def make_obs_ph(name):
        return BatchInput(observation_space_shape, name=name)

    model = deepq.models.cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[args.num_units] * args.num_layers,
        dueling=bool(args.dueling),
    )
    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': model,
        'scope': args.scope
    }
    act = deepq.load(os.path.join(args.log_dir, args.log_fname), act_params)
    if args.record:
        env = Monitor(env, directory=args.log_dir)
    episode_rew = 0
    t = 0
    while True:
        obs, done = env.reset(), False

        while not done:
            if not args.record:
                env.render()
                #time.sleep(0.01)
            obs, rew, done, info = env.step(act(obs[None])[0])
            episode_rew += rew
            t += 1
        if info['ale.lives'] == 0:
            print("Episode reward %.2f after %d steps" % (episode_rew, t))
            episode_rew = 0
            t = 0
Example #30
def run_random_agent(env_name='CartPole-v0'):
    env = Monitor(gym.make(env_name), './video')
    for i_episode in range(1):
        observation = env.reset()
        for t in range(100):
            env.render()
            print('at t', t)
            print('\t observation:', observation)
            print('\t action space:', env.action_space)

            # 4 actions (as in LunarLander-v2): do nothing, left engine, main engine, right engine;
            # note the default env here is CartPole-v0, which only has two actions
            action = env.action_space.sample()
            if action == 1 or action == 3:
                # don't fire side engines
                action = 0
            observation, reward, done, info = env.step(action)

            print('\t action', action)
            print('\t reward', reward)
            if done:
                print("Episode finished after {} timesteps".format(t + 1))
                break
    env.close()