Example #1
                           np.expand_dims(next_state, 2),
                           axis=2)

    # Save the transition to replay memory
    replay_memory.append(Transition(state, action, reward, next_state, done))
    if done:
        state = env.reset()
        state = pre_proc(state)
        state = np.stack([state] * 4, axis=2)
    else:
        state = next_state
print('Initialize replay buffer: done!')

# Record videos
env = Monitor(env,
              directory=monitor_path,
              resume=True,
              video_callable=lambda count: count % record_video_every == 0)
total_t = 0
for i_episode in range(num_episodes):
    loss = None
    state = env.reset()
    state = pre_proc(state)
    state = np.stack([state] * 4, axis=2)
    # One step in the environment
    for t in itertools.count():
        # Choose an action epsilon-greedily (mostly random before learning has progressed)
        epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)]
        action = select_epilson_greedy_action(q_estimator, state, epsilon)

        next_state, reward, done, _ = env.step(action)
        # clip rewards between -1 and 1 ??
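The trailing comment asks about clipping rewards to [-1, 1]; a minimal sketch of that common DQN convention (an assumption on my part, not shown in the original snippet):

import numpy as np

def clip_reward(reward):
    # Clip the raw environment reward to [-1, 1], the usual DQN convention.
    return float(np.clip(reward, -1.0, 1.0))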
Example #2
def record_sessions(env_id, agent, n_actions):
    env = Monitor(gym.make(env_id), directory='videos', force=True)
    for _ in range(100):
        generate_agent_session(env, agent, n_actions)

    env.close()
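`generate_agent_session` is defined elsewhere in the original project; purely for context, a minimal sketch of what such a rollout helper could look like (the `agent.predict_proba` interface and `t_max` default are hypothetical assumptions):

import numpy as np

def generate_agent_session(env, agent, n_actions, t_max=1000):
    # Hypothetical rollout helper: play one episode so the Monitor wrapper records it.
    s = env.reset()
    for _ in range(t_max):
        # assume the agent returns a probability distribution over the n_actions discrete actions
        probs = agent.predict_proba([s])[0]
        a = np.random.choice(n_actions, p=probs)
        s, _, done, _ = env.step(a)
        if done:
            break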
Example #3
               config=vars(args),
               name=experiment_name,
               monitor_gym=True,
               save_code=True)
    writer = SummaryWriter(f"/tmp/{experiment_name}")

# TRY NOT TO MODIFY: seeding
device = torch.device(
    'cuda' if torch.cuda.is_available() and args.cuda else 'cpu')
env = gym.make(args.gym_id)
#env = wrap_atari(env)
env = ImgObsWrapper(env)

#env = gym.wrappers.RecordEpisodeStatistics(env) # records episode reward in `info['episode']['r']`
if args.capture_video:
    env = Monitor(env, f'videos/{experiment_name}')
#env = wrap_deepmind(
#    env,
#    clip_rewards=True,
#    frame_stack=True,
#    scale=False,
#)

random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.backends.cudnn.deterministic = args.torch_deterministic
env.seed(args.seed)
env.action_space.seed(args.seed)
env.observation_space.seed(args.seed)
# respect the default timelimit
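The trailing comment points at re-applying the environment's step limit; the code that does so is not shown, but a minimal sketch assuming gym's TimeLimit wrapper (the step count is an illustrative value, not from the original script) could be:

from gym.wrappers import TimeLimit

# Re-wrap with an explicit episode step limit (1000 is illustrative).
env = TimeLimit(env, max_episode_steps=1000)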
Example #4
    def evaluate(self,
                 n_games=1,
                 save_path="./records",
                 use_monitor=True,
                 record_video=True,
                 verbose=True,
                 t_max=10000):
        """Plays an entire game start to end, records the logs(and possibly mp4 video), returns reward.

        :param save_path: where to save the report
        :param record_video: if True, records mp4 video
        :return: total reward (scalar)
        """
        env = self.make_env()

        if not use_monitor and record_video:
            raise ValueError(
                "Cannot record video without the gym Monitor. "
                "If you still want video, set use_monitor to True.")

        if record_video:
            env = Monitor(env, save_path, force=True)
        elif use_monitor:
            env = Monitor(env,
                          save_path,
                          video_callable=lambda i: False,
                          force=True)

        game_rewards = []
        for _ in range(n_games):
            # initial observation
            observation = env.reset()
            # initial memory
            prev_memories = [
                np.zeros((1, ) + tuple(mem.output_shape[1:]),
                         dtype=get_layer_dtype(mem))
                for mem in self.agent.agent_states
            ]

            t = 0
            total_reward = 0
            while True:

                res = self.agent_step(
                    self.preprocess_observation(observation)[None, ...],
                    *prev_memories)
                action, new_memories = res[0], res[1:]

                observation, reward, done, info = env.step(action[0])

                total_reward += reward
                prev_memories = new_memories

                if done or t >= t_max:
                    if verbose:
                        print(
                            "Episode finished after {} timesteps with reward={}"
                            .format(t + 1, total_reward))
                    break
                t += 1
            game_rewards.append(total_reward)

        env.close()
        del env
        return game_rewards
Example #5
model_path = utils.get_model_dir(args.model)

for test_mode in test_modes:

    # Generate environment

    if "_n" in args.env:
        env = gym.make(args.env,
                       pairs_dict=pairs_dict,
                       test_instr_mode=test_mode,
                       num_dists=args.num_dists)
    else:
        env = gym.make(args.env)

    demo_path = os.path.join(model_path, test_mode)
    env = Monitor(env, demo_path, _check_log_this, force=True)
    env.seed(args.seed)

    # Define agent
    agent = utils.load_agent(env=env,
                             model_name=args.model,
                             argmax=args.argmax,
                             env_name=args.env,
                             instr_arch=args.instr_arch)
    utils.seed(args.seed)

    print('\n')
    print(f'=== EVALUATING MODE: {test_mode} ===')

    # Run the agent
    done = False
Example #6
import gym
from gym.wrappers import Monitor
from pyvirtualdisplay import Display

display = Display(visible=0, size=(1400, 900))
display.start()

env = Monitor(gym.make('CartPole-v0'), './video', force=True)
env.reset()

done = False

while not done:
    env.render()
    action = env.action_space.sample()
    _, _, done, _ = env.step(action)

env.close()
display.stop()
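Note that `gym.wrappers.Monitor` was removed in later Gym releases; a rough equivalent of the example above using the newer wrapper (assuming a Gym version around 0.21-0.25, where `RecordVideo` exists and the classic 4-tuple step API still applies; the virtual-display setup stays the same) is:

import gym
from gym.wrappers import RecordVideo

# Record every episode to ./video (roughly what Monitor(..., force=True) did above).
env = RecordVideo(gym.make('CartPole-v0'), video_folder='./video',
                  episode_trigger=lambda episode_id: True)
env.reset()

done = False
while not done:
    action = env.action_space.sample()
    _, _, done, _ = env.step(action)

env.close()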
Example #7
def main_test(id):
    config(id)
    env = gym.make(id)
    env = env.unwrapped
    dqn = MyDQN(env)
    if id == 'CartPole-v0':
        T = 20000
    else:
        T = 2000

    count = 0
    train_result = []
    train_loss = []
    for i in range(2000):
        observation = env.reset()
        for j in range(T):
            action = dqn.action(observation, i)
            new_observation, reward, done, info = env.step(action)
            if id == 'CartPole-v0':
                r1 = (env.x_threshold -
                      abs(new_observation[0])) / env.x_threshold - 0.8
                r2 = (env.theta_threshold_radians - abs(
                    new_observation[2])) / env.theta_threshold_radians - 0.5
                reward = r1 + r2
                '''if j<2000:
                    reward=-200'''

            elif done:
                reward = 100
            dqn.perceive(observation, action, reward, new_observation, done)
            observation = new_observation
            if done == False and j != T - 1:
                continue
            train_result.append(j)

            if id == 'CartPole-v0':
                if done or j == T - 1:
                    if j > 5000:
                        count += 1
                    else:
                        count = 0
                    print(i, j)
                    break
            elif id == 'MountainCar-v0':
                print(i, j)
                if done and j < 300:
                    count += 1
                else:
                    count = 0
                break
            else:
                print(i, j)
                if done and j < 300:
                    count += 1
                else:
                    count = 0
                break
        train_loss.append(dqn.get_loss() / train_result[-1])
        if id == 'CartPole-v0' and count >= 5:
            break
        if id != 'CartPole-v0' and count >= 200:
            break
    print(train_loss)
    print(train_result)
    plt.plot(train_loss)
    plt.xlabel("round")
    plt.ylabel("loss")
    plt.show()
    if id != 'CartPole-v0':
        train_result = -np.array(train_result)
    plt.plot(train_result)
    plt.xlabel("round")
    plt.ylabel("reward")
    plt.show()

    if RECORD:
        env = Monitor(env, './cartpole-experiment-0201', force=True)
        observation = env.reset()
        for j in range(T):
            #env.render()
            action = dqn.best_action(observation)
            observation, reward, done, info = env.step(action)
        env.close()

    result = []
    for i in range(200):
        observation = env.reset()
        for j in range(T):
            #env.render()
            action = dqn.best_action(observation)
            observation, reward, done, info = env.step(action)
            if done or j == T - 1:
                print("test", j + 1)
                result.append(j + 1)
                break
    result = np.array(result)
    if id != 'CartPole-v0':
        result = -result
    plt.plot(result)
    plt.xlabel("round")
    plt.ylabel("reward")
    plt.show()
    print("mean", np.mean(result))
    print("var", np.std(result))
    print("len", len(result))
def main():

    global RENDER_DELAY

    assert len(sys.argv) > 1, 'python model.py gamename path_to_mode.json'

    gamename = sys.argv[1]

    if gamename.startswith("bullet"):
        RENDER_DELAY = True

    use_model = False

    game = config.games[gamename]

    if len(sys.argv) > 2:
        use_model = True
        filename = sys.argv[2]
        print("filename", filename)

    the_seed = 0
    if len(sys.argv) > 3:
        the_seed = int(sys.argv[3])
        print("seed", the_seed)

    model = make_model(game)
    print('model size', model.param_count)

    model.make_env(render_mode=render_mode)

    if use_model:
        model.load_model(filename)
    else:
        params = model.get_random_model_params(stdev=0.1)
        model.set_model_params(params)

    if final_mode:
        np.random.seed(the_seed)
        model.env.seed(the_seed)
        rewards = []

        for i in range(100):
            reward, steps_taken = simulate(model,
                                           train_mode=False,
                                           render_mode=False,
                                           num_episode=1)
            print(i, reward)
            rewards.append(reward[0])
        print("seed", the_seed, "average_reward", np.mean(rewards),
              "standard_deviation", np.std(rewards))
    else:
        if record_video:
            model.env = Monitor(model.env,
                                directory='/tmp/' + gamename,
                                video_callable=lambda episode_id: True,
                                write_upon_reset=True,
                                force=True)
        while True:
            reward, steps_taken = simulate(model,
                                           train_mode=False,
                                           render_mode=render_mode,
                                           num_episode=1)
            print("terminal reward", reward, "average steps taken",
                  np.mean(steps_taken) + 1)
Example #9
    output = np.squeeze(output, axis=0)
    stochastic_action = output + noise_process.sample()
    # bound to torcs scope
    bounded = np.clip(stochastic_action, action_space.low, action_space.high)
    return bounded


if __name__ == "__main__":
    tf.logging.info(
        "@@@  start ddpg training gym_bipedal_walker_v2 @@@ start time:{}".
        format(time.ctime()))
    # Generate a Torcs environment
    train_env = gym.make(id='BipedalWalker-v2')

    eval_monitor = Monitor(gym.make(id='BipedalWalker-v2'),
                           directory=DDPG_CFG.eval_monitor_dir,
                           video_callable=lambda x: False,
                           resume=True)

    mu = np.array([0.0, 0.0, 0.0, 0.0])
    # x0=np.array([0, 0.5, -0.1])
    theta = np.array([0.15, 0.15, 0.15, 0.15])
    sigma = np.array([0.3, 0.3, 0.3, 0.3])
    # x0 = np.array([0.1, 0.3, 0.1])
    # TODO: greedy accel in the beginning
    x0 = np.array([
        -0.2,
        0.2,
        0.2,
        0.2,
    ])
    noise_process = UO_Process(mu=mu, x0=x0, theta=theta, sigma=sigma, dt=1e-2)
Example #10
def wrap_env(env):
    env = Monitor(env, './video', force=True)
    return env
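A minimal usage sketch for the helper above, assuming a standard Gym environment such as CartPole-v0:

import gym

env = wrap_env(gym.make('CartPole-v0'))
observation = env.reset()
done = False
while not done:
    observation, reward, done, info = env.step(env.action_space.sample())
env.close()  # closing the Monitor flushes the recorded .mp4 files to ./video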
Example #11
def wrap_env(env):
    # wrapper for recording
    env = Monitor(env, './video', force=True)
    return env
            
            indices = np.array([i for i in range(self.batch_size)])
            action_state_value[[indices],[actions]] = next_action_state_value

            self.model.fit(states, action_state_value, epochs=1, verbose=0)

            
####################################################################################################
# Run

File_Epsilon = open(str(FILE_EPSILON), 'a+')
File_Rewards = open(str(FILE_REWARDS), 'a+')

env = gym.make('LunarLander-v2')
if RECORD == True:
    env = Monitor(env=env, directory=PATH_VIDEO, force=True)
env.seed(0)

action_space = env.action_space.n
state_space = env.observation_space.shape[0]
agent = Agent(action_space, state_space)
if path.exists(PATH_WEIGHTS):
    agent.model.load_weights(PATH_WEIGHTS)
    
rewards = []
    
for episode in range(EPISODES):
    state = env.reset()
    state = np.reshape(state,(1,state_space))
        
    score = 0
Example #13
def main(argv=()):
    del argv  # Unused.

    # Build an environment
    
    # Create and record episode - remove Monitor statement if recording not desired
    env = Monitor(gym.make('one-random-evader-v0'), './tmp/pursuit_evasion_infer_pursuer_vs_random_evader', force=True)

    #Reset state
    state = env.reset()
    
    #Initialize Agent Parameters
    #Get observed state space
    observed_state_space = env.get_observed_state_space()
    #Set initial state distribution
    initial_state_dist = []
    initial_state = env.get_initial_state()
    for state in observed_state_space:
        if state == initial_state:
            initial_state_dist.append(1)
        else:
            initial_state_dist.append(0)
    #Get action space
    action_space = range(0, env.action_space.n)
    #Set action prior to uniform dist
    action_prior = []
    for action in action_space:
        action_prior.append(1/len(action_space))
    #Get reward function
    reward_function = env.get_reward_function()
    #Get transition function 
    transition_function = env.get_transition_function()
    #Set max trajectory length
    max_trajectory_length = 11 #needs to be greater than shortest distance to evader for any meaningful inference

    #Create Agent
    agent = infer.DiceInferenceEngine(observed_state_space, action_space, initial_state_dist, action_prior, reward_function, transition_function, max_trajectory_length)
    print("\nAgent created.\n")
    #Set current observed state to initial state
    uncolored_obs = initial_state
    #Initialize actions list
    actions = []
    print("\nInfering action " + str(0) + "\n")
    actions.append(dist.Categorical(torch.tensor(agent.next(uncolored_obs))).sample().item())

    #Game Loop
    for t in range(0, 11):

        #Render
        env.render()
         
        #Delay to make video easier to watch
        #sleep(5)

        #Take action and get observations, rewards, termination from environment 
        observation, reward, done, info = env.step(actions[t]) 

        #If termination signal received, break out of loop
        if done:
            break

        #Pick next action based on agent's reasoning
        uncolored_obs = env.uncolor_board(observation)
        print("\nInfering action " + str(t + 1) + "\n")
        actions.append(dist.Categorical(torch.tensor(agent.next(uncolored_obs))).sample().item())


 

    env.close()
Example #14
def main():

    global RENDER_DELAY

    parser = argparse.ArgumentParser(
        description=('Train policy on OpenAI Gym environment '
                     'using pepg, ses, openes, ga, cma'))
    parser.add_argument('gamename',
                        type=str,
                        help='robo_pendulum, robo_ant, robo_humanoid, etc.')
    parser.add_argument('-f',
                        '--filename',
                        type=str,
                        help='json filename',
                        default='none')
    parser.add_argument('-e',
                        '--eval_steps',
                        type=int,
                        default=100,
                        help='evaluate this number of step if final_mode')
    parser.add_argument('-s',
                        '--seed_start',
                        type=int,
                        default=0,
                        help='initial seed')
    parser.add_argument('-w',
                        '--single_weight',
                        type=float,
                        default=-100,
                        help='single weight parameter')
    parser.add_argument('--stdev',
                        type=float,
                        default=2.0,
                        help='standard deviation for weights')
    parser.add_argument(
        '--sweep',
        type=int,
        default=-1,
        help='sweep a set of weights from -2.0 to 2.0 sweep times.')
    parser.add_argument('--lo',
                        type=float,
                        default=-2.0,
                        help='low side of sweep.')
    parser.add_argument('--hi',
                        type=float,
                        default=2.0,
                        help='high side of sweep.')

    args = parser.parse_args()

    assert len(sys.argv) > 1, 'python model.py gamename path_to_mode.json'

    gamename = args.gamename

    use_model = False

    game = config.games[gamename]

    filename = args.filename
    if filename != "none":
        use_model = True
        print("filename", filename)

    the_seed = args.seed_start

    model = make_model(game)
    print('model size', model.param_count)

    eval_steps = args.eval_steps
    single_weight = args.single_weight
    weight_stdev = args.stdev
    num_sweep = args.sweep
    sweep_lo = args.lo
    sweep_hi = args.hi

    model.make_env(render_mode=render_mode)

    if use_model:
        model.load_model(filename)
    else:
        if single_weight > -100:
            params = model.get_single_model_params(
                weight=single_weight - game.weight_bias)  # REMEMBER TO UNBIAS
            print("single weight value set to", single_weight)
        else:
            params = model.get_uniform_random_model_params(
                stdev=weight_stdev) - game.weight_bias
        model.set_model_params(params)

    if final_mode:
        if num_sweep > 1:
            the_weights = np.arange(
                sweep_lo, sweep_hi + (sweep_hi - sweep_lo) / num_sweep,
                (sweep_hi - sweep_lo) / num_sweep)
            for i in range(len(the_weights)):
                the_weight = the_weights[i]
                params = model.get_single_model_params(
                    weight=the_weight - game.weight_bias)  # REMEMBER TO UNBIAS
                model.set_model_params(params)
                rewards = []
                for i in range(eval_steps):
                    reward, steps_taken = simulate(model,
                                                   train_mode=False,
                                                   render_mode=False,
                                                   num_episode=1,
                                                   seed=the_seed + i)
                    rewards.append(reward[0])
                print("weight", the_weight, "average_reward", np.mean(rewards),
                      "standard_deviation", np.std(rewards))
        else:
            rewards = []
            for i in range(eval_steps):
                ''' random uniform params
        params = model.get_uniform_random_model_params(stdev=weight_stdev)-game.weight_bias
        model.set_model_params(params)
        '''
                reward, steps_taken = simulate(model,
                                               train_mode=False,
                                               render_mode=False,
                                               num_episode=1,
                                               seed=the_seed + i)
                print(i, reward)
                rewards.append(reward[0])
            print("seed", the_seed, "average_reward", np.mean(rewards),
                  "standard_deviation", np.std(rewards))
    else:
        if record_video:
            model.env = Monitor(model.env,
                                directory='/tmp/' + gamename,
                                video_callable=lambda episode_id: True,
                                write_upon_reset=True,
                                force=True)
        for i in range(1):
            reward, steps_taken = simulate(model,
                                           train_mode=False,
                                           render_mode=render_mode,
                                           num_episode=1,
                                           seed=the_seed + i)
            print("terminal reward", reward, "average steps taken",
                  np.mean(steps_taken) + 1)
Example #15
def deep_q_learning(sess,
                    env,
                    q_estimator,
                    target_estimator,
                    state_processor,
                    num_episodes,
                    experiment_dir,
                    replay_memory_size=500000,
                    replay_memory_init_size=50000,
                    update_target_estimator_every=10000,
                    discount_factor=0.99,
                    epsilon_start=1.0,
                    epsilon_end=0.1,
                    epsilon_decay_steps=500000,
                    batch_size=32,
                    record_video_every=50):

    Transition = namedtuple(
        "Transition", ["state", "action", "reward", "next_state", "done"])

    # The replay memory
    replay_memory = []

    # Make model copier object
    estimator_copy = ModelParametersCopier(q_estimator, target_estimator)

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    # For 'system/' summaries, useful to check if the current process looks healthy
    current_process = psutil.Process()

    # Create directories for checkpoints and summaries
    checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
    checkpoint_path = os.path.join(checkpoint_dir, "model")
    monitor_path = os.path.join(experiment_dir, "monitor")

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    if not os.path.exists(monitor_path):
        os.makedirs(monitor_path)

    saver = tf.train.Saver()
    # Load a previous checkpoint if we find one
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        print("Loading model checkpoint {}...\n".format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)

    # Get the current time step
    total_t = sess.run(tf.contrib.framework.get_global_step())

    # The epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

    # The policy we're following
    policy = make_epsilon_greedy_policy(q_estimator, len(VALID_ACTIONS))

    # Populate the replay memory with initial experience
    print("Populating replay memory...")
    state = env.reset()
    state = state_processor.process(sess, state)
    state = np.stack([state] * 4, axis=2)
    for i in range(replay_memory_init_size):
        action_probs = policy(sess, state,
                              epsilons[min(total_t, epsilon_decay_steps - 1)])
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
        next_state = state_processor.process(sess, next_state)
        next_state = np.append(state[:, :, 1:],
                               np.expand_dims(next_state, 2),
                               axis=2)
        replay_memory.append(
            Transition(state, action, reward, next_state, done))
        if done:
            state = env.reset()
            state = state_processor.process(sess, state)
            state = np.stack([state] * 4, axis=2)
        else:
            state = next_state

    # Record videos
    # Add env Monitor wrapper
    env = Monitor(env,
                  directory=monitor_path,
                  video_callable=lambda count: count % record_video_every == 0,
                  resume=True)

    for i_episode in range(num_episodes):

        # Save the current checkpoint
        saver.save(tf.get_default_session(), checkpoint_path)

        # Reset the environment
        state = env.reset()
        state = state_processor.process(sess, state)
        state = np.stack([state] * 4, axis=2)
        loss = None

        # One step in the environment
        for t in itertools.count():

            # Epsilon for this time step
            epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)]

            # Maybe update the target estimator
            if total_t % update_target_estimator_every == 0:
                estimator_copy.make(sess)
                print("\nCopied model parameters to target network.")

            # Print out which step we're on, useful for debugging.
            print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format(
                t, total_t, i_episode + 1, num_episodes, loss),
                  end="")
            sys.stdout.flush()

            # Take a step
            action_probs = policy(sess, state, epsilon)
            action = np.random.choice(np.arange(len(action_probs)),
                                      p=action_probs)
            next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
            next_state = state_processor.process(sess, next_state)
            next_state = np.append(state[:, :, 1:],
                                   np.expand_dims(next_state, 2),
                                   axis=2)

            # If our replay memory is full, pop the first element
            if len(replay_memory) == replay_memory_size:
                replay_memory.pop(0)

            # Save transition to replay memory
            replay_memory.append(
                Transition(state, action, reward, next_state, done))

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # Sample a minibatch from the replay memory
            samples = random.sample(replay_memory, batch_size)
            states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(
                np.array, zip(*samples))

            # Calculate q values and targets
            q_values_next = target_estimator.predict(sess, next_states_batch)
            targets_batch = reward_batch + np.invert(done_batch).astype(
                np.float32) * discount_factor * np.amax(q_values_next, axis=1)

            # Perform gradient descent update
            states_batch = np.array(states_batch)
            loss = q_estimator.update(sess, states_batch, action_batch,
                                      targets_batch)

            if done:
                break

            state = next_state
            total_t += 1

        # Add summaries to tensorboard
        episode_summary = tf.Summary()
        episode_summary.value.add(simple_value=epsilon, tag="episode/epsilon")
        episode_summary.value.add(
            simple_value=stats.episode_rewards[i_episode],
            tag="episode/reward")
        episode_summary.value.add(
            simple_value=stats.episode_lengths[i_episode],
            tag="episode/length")
        episode_summary.value.add(simple_value=current_process.cpu_percent(),
                                  tag="system/cpu_usage_percent")
        episode_summary.value.add(
            simple_value=current_process.memory_percent(memtype="vms"),
            tag="system/v_memeory_usage_percent")
        q_estimator.summary_writer.add_summary(episode_summary, i_episode)
        q_estimator.summary_writer.flush()

        yield total_t, plotting.EpisodeStats(
            episode_lengths=stats.episode_lengths[:i_episode + 1],
            episode_rewards=stats.episode_rewards[:i_episode + 1])

    return stats
Example #16
def run(seed, episodes, evaluation_episodes, batch_size, gamma, inverting_gradients, initial_memory_threshold,
        replay_memory_size, epsilon_steps, epsilon_final, tau_actor, tau_actor_param, use_ornstein_noise,
        learning_rate_actor, learning_rate_actor_param, reward_scale, clip_grad, title, scale_actions,
        zero_index_gradients, split, layers, multipass, indexed, weighted, average, random_weighted, render_freq,
        action_input_layer, initialise_params, save_freq, save_dir, save_frames, visualise):

    env = gym.make('Goal-v0')
    env = GoalObservationWrapper(env)

    if save_freq > 0 and save_dir:
        save_dir = os.path.join(save_dir, title + "{}".format(str(seed)))
        os.makedirs(save_dir, exist_ok=True)
    assert not (save_frames and visualise)
    if visualise:
        assert render_freq > 0
    if save_frames:
        assert render_freq > 0
        vidir = os.path.join(save_dir, "frames")
        os.makedirs(vidir, exist_ok=True)

    if scale_actions:
        kickto_weights = np.array([[-0.375, 0.5, 0, 0.0625, 0],
                                   [0, 0, 0.8333333333333333333, 0, 0.111111111111111111111111]])
        shoot_goal_left_weights = np.array([0.857346647646219686, 0])
        shoot_goal_right_weights = np.array([-0.857346647646219686, 0])
    else:
        xfear = 50.0 / PITCH_LENGTH
        yfear = 50.0 / PITCH_WIDTH
        caution = 5.0 / PITCH_WIDTH
        kickto_weights = np.array([[2.5, 1, 0, xfear, 0], [0, 0, 1 - caution, 0, yfear]])
        shoot_goal_left_weights = np.array([GOAL_WIDTH / 2 - 1, 0])
        shoot_goal_right_weights = np.array([-GOAL_WIDTH / 2 + 1, 0])

    initial_weights = np.zeros((4, 17))
    initial_weights[0, [10, 11, 14, 15]] = kickto_weights[0, 1:]
    initial_weights[1, [10, 11, 14, 15]] = kickto_weights[1, 1:]
    initial_weights[2, 16] = shoot_goal_left_weights[1]
    initial_weights[3, 16] = shoot_goal_right_weights[1]

    initial_bias = np.zeros((4,))
    initial_bias[0] = kickto_weights[0, 0]
    initial_bias[1] = kickto_weights[1, 0]
    initial_bias[2] = shoot_goal_left_weights[0]
    initial_bias[3] = shoot_goal_right_weights[0]

    if not scale_actions:
        # rescale initial action-parameters for a scaled state space
        for a in range(env.action_space.spaces[0].n):
            mid = (env.observation_space.spaces[0].high + env.observation_space.spaces[0].low) / 2.
            initial_bias[a] += np.sum(initial_weights[a] * mid)
            initial_weights[a] = initial_weights[a]*env.observation_space.spaces[0].high - initial_weights[a] * mid

    env = GoalFlattenedActionWrapper(env)
    if scale_actions:
        env = ScaledParameterisedActionWrapper(env)
    env = ScaledStateWrapper(env)
    dir = os.path.join(save_dir, title)
    env = Monitor(env, directory=os.path.join(dir, str(seed)), video_callable=False, write_upon_reset=False, force=True)
    env.seed(seed)
    np.random.seed(seed)

    assert not (split and multipass)
    agent_class = PDQNAgent
    if split:
        agent_class = SplitPDQNAgent
    elif multipass:
        agent_class = MultiPassPDQNAgent
    agent = agent_class(
                       observation_space=env.observation_space.spaces[0], action_space=env.action_space,
                       batch_size=batch_size,
                       learning_rate_actor=learning_rate_actor,  # 0.0001
                       learning_rate_actor_param=learning_rate_actor_param,  # 0.001
                       epsilon_steps=epsilon_steps,
                       epsilon_final=epsilon_final,
                       gamma=gamma,
                       clip_grad=clip_grad,
                       indexed=indexed,
                       average=average,
                       random_weighted=random_weighted,
                       tau_actor=tau_actor,
                       weighted=weighted,
                       tau_actor_param=tau_actor_param,
                       initial_memory_threshold=initial_memory_threshold,
                       use_ornstein_noise=use_ornstein_noise,
                       replay_memory_size=replay_memory_size,
                       inverting_gradients=inverting_gradients,
                       actor_kwargs={'hidden_layers': layers, 'output_layer_init_std': 1e-5,
                                     'action_input_layer': action_input_layer,},
                       actor_param_kwargs={'hidden_layers': layers, 'output_layer_init_std': 1e-5,
                                           'squashing_function': False},
                       zero_index_gradients=zero_index_gradients,
                       seed=seed)

    if initialise_params:
        agent.set_action_parameter_passthrough_weights(initial_weights, initial_bias)
    print(agent)
    max_steps = 150
    total_reward = 0.
    returns = []
    start_time = time.time()
    video_index = 0
    for i in range(episodes):
        if save_freq > 0 and save_dir and i % save_freq == 0:
            agent.save_models(os.path.join(save_dir, str(i)))

        state, _ = env.reset()
        state = np.array(state, dtype=np.float32, copy=False)
        act, act_param, all_action_parameters = agent.act(state)
        action = pad_action(act, act_param)

        if visualise and i % render_freq == 0:
            env.render()

        episode_reward = 0.
        agent.start_episode()
        for j in range(max_steps):
            ret = env.step(action)
            (next_state, steps), reward, terminal, _ = ret
            next_state = np.array(next_state, dtype=np.float32, copy=False)

            next_act, next_act_param, next_all_action_parameters = agent.act(next_state)
            next_action = pad_action(next_act, next_act_param)
            r = reward * reward_scale
            agent.step(state, (act, all_action_parameters), r, next_state,
                       (next_act, next_all_action_parameters), terminal, steps)
            act, act_param, all_action_parameters = next_act, next_act_param, next_all_action_parameters
            action = next_action
            state = next_state
            episode_reward += reward

            if visualise and i % render_freq == 0:
                env.render()

            if terminal:
                break
        agent.end_episode()

        if save_frames:
            video_index = env.unwrapped.save_render_states(vidir, title, video_index)

        returns.append(episode_reward)
        total_reward += episode_reward
        if (i + 1) % 100 == 0:
            print('{0:5s} R:{1:.5f} P(S):{2:.4f}'.format(str(i + 1), total_reward / (i + 1),
                                                         (np.array(returns) == 50.).sum() / len(returns)))
    end_time = time.time()
    print("Training took %.2f seconds" % (end_time - start_time))
    env.close()

    if save_freq > 0 and save_dir:
        agent.save_models(os.path.join(save_dir, str(i)))

    returns = env.get_episode_rewards()
    np.save(os.path.join(dir, title + "{}".format(str(seed))), returns)

    if evaluation_episodes > 0:
        print("Evaluating agent over {} episodes".format(evaluation_episodes))
        agent.epsilon_final = 0.
        agent.epsilon = 0.
        agent.noise = None
        evaluation_returns = evaluate(env, agent, evaluation_episodes)
        print("Ave. evaluation return =", sum(evaluation_returns) / len(evaluation_returns))
        print("Ave. evaluation prob. =", sum(evaluation_returns == 50.) / len(evaluation_returns))
        np.save(os.path.join(dir, title + "{}e".format(str(seed))), evaluation_returns)
Example #17
def deep_q_learning(sess,
                    env,
                    q_estimator,
                    target_estimator,
                    state_processor,
                    num_episodes,
                    experiment_dir,
                    replay_memory_size=500000,
                    replay_memory_init_size=50000,
                    update_target_estimator_every=10000,
                    discount_factor=0.99,
                    epsilon_start=1.0,
                    epsilon_end=0.1,
                    epsilon_decay_steps=500000,
                    batch_size=32,
                    record_video_every=50):
    """
    Q-Learning algorithm for off-policy TD control using Function Approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.

    Args:
        sess: Tensorflow Session object
        env: OpenAI environment
        q_estimator: Estimator object used for the q values
        target_estimator: Estimator object used for the targets
        state_processor: A StateProcessor object
        num_episodes: Number of episodes to run for
        experiment_dir: Directory to save Tensorflow summaries in
        replay_memory_size: Size of the replay memory
        replay_memory_init_size: Number of random experiences to sample when initializing
          the replay memory.
        update_target_estimator_every: Copy parameters from the Q estimator to the 
          target estimator every N steps
        discount_factor: Gamma discount factor
        epsilon_start: Chance to sample a random action when taking an action.
          Epsilon is decayed over time and this is the start value
        epsilon_end: The final minimum value of epsilon after decaying is done
        epsilon_decay_steps: Number of steps to decay epsilon over
        batch_size: Size of batches to sample from the replay memory
        record_video_every: Record a video every N episodes

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    Transition = namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])

    # The replay memory
    replay_memory = []

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    # Create directories for checkpoints and summaries
    checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
    checkpoint_path = os.path.join(checkpoint_dir, "model")
    monitor_path = os.path.join(experiment_dir, "monitor")

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    if not os.path.exists(monitor_path):
        os.makedirs(monitor_path)

    saver = tf.train.Saver()
    # Load a previous checkpoint if we find one
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        print("Loading model checkpoint {}...\n".format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)
    
    # Get the current time step
    total_t = sess.run(tf.contrib.framework.get_global_step())

    # The epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

    # The policy we're following
    policy = make_epsilon_greedy_policy(
        q_estimator,
        len(VALID_ACTIONS))

    # Populate the replay memory with initial experience
    print("Populating replay memory...")
    state = env.reset()
    state = state_processor.process(sess, state)
    state = np.stack([state] * 4, axis=2)
    for i in range(replay_memory_init_size):
        action_probs = policy(sess, state, epsilons[total_t])
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
        next_state = state_processor.process(sess, next_state)
        next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2)
        replay_memory.append(Transition(state, action, reward, next_state, done))
        if done:
            state = env.reset()
            state = state_processor.process(sess, state)
            state = np.stack([state] * 4, axis=2)
        else:
            state = next_state

    # Record videos
    env = Monitor(env,
                  monitor_path,
                  resume=True,
                  video_callable=lambda count: count % record_video_every == 0)

    for i_episode in range(num_episodes):

        # Save the current checkpoint
        saver.save(tf.get_default_session(), checkpoint_path)

        # Reset the environment
        state = env.reset()
        state = state_processor.process(sess, state)
        state = np.stack([state] * 4, axis=2)
        loss = None

        # One step in the environment
        for t in itertools.count():

            # Epsilon for this time step
            epsilon = epsilons[min(total_t, epsilon_decay_steps-1)]

            # Add epsilon to Tensorboard
            episode_summary = tf.Summary()
            episode_summary.value.add(simple_value=epsilon, tag="epsilon")
            q_estimator.summary_writer.add_summary(episode_summary, total_t)

            # Maybe update the target estimator
            if total_t % update_target_estimator_every == 0:
                copy_model_parameters(sess, q_estimator, target_estimator)
                print("\nCopied model parameters to target network.")

            # Print out which step we're on, useful for debugging.
            print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format(
                    t, total_t, i_episode + 1, num_episodes, loss), end="")
            sys.stdout.flush()

            # Take a step
            action_probs = policy(sess, state, epsilon)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
            next_state = state_processor.process(sess, next_state)
            next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2)

            # If our replay memory is full, pop the first element
            if len(replay_memory) == replay_memory_size:
                replay_memory.pop(0)

            # Save transition to replay memory
            replay_memory.append(Transition(state, action, reward, next_state, done))   

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # Sample a minibatch from the replay memory
            samples = random.sample(replay_memory, batch_size)
            states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(np.array, zip(*samples))

            # Calculate q values and targets
            # This is where Double Q-Learning comes in!
            q_values_next = q_estimator.predict(sess, next_states_batch)
            best_actions = np.argmax(q_values_next, axis=1)
            q_values_next_target = target_estimator.predict(sess, next_states_batch)
            targets_batch = reward_batch + np.invert(done_batch).astype(np.float32) * \
                discount_factor * q_values_next_target[np.arange(batch_size), best_actions]

            # Perform gradient descent update
            states_batch = np.array(states_batch)
            loss = q_estimator.update(sess, states_batch, action_batch, targets_batch)

            if done:
                break

            state = next_state
            total_t += 1

        # Add summaries to tensorboard
        episode_summary = tf.Summary()
        episode_summary.value.add(simple_value=stats.episode_rewards[i_episode], node_name="episode_reward", tag="episode_reward")
        episode_summary.value.add(simple_value=stats.episode_lengths[i_episode], node_name="episode_length", tag="episode_length")
        q_estimator.summary_writer.add_summary(episode_summary, total_t)
        q_estimator.summary_writer.flush()

        yield total_t, plotting.EpisodeStats(
            episode_lengths=stats.episode_lengths[:i_episode+1],
            episode_rewards=stats.episode_rewards[:i_episode+1])

    # env.monitor.close()
    return stats
Example #18
def record_videos(env, path="videos"):
    return Monitor(env, path, force=True, video_callable=lambda episode: True)
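Here `video_callable=lambda episode: True` records every episode; a variant that records only every Nth episode (the helper name and interval are illustrative, not from the original) might look like:

from gym.wrappers import Monitor

def record_videos_every(env, path="videos", every=10):
    # Record only episodes whose index is a multiple of `every`.
    return Monitor(env, path, force=True,
                   video_callable=lambda episode_id: episode_id % every == 0)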
Example #19
    def wrap_env(self):
        self.env = Monitor(self.env, './video', force=True)
        return self.env
Example #20
def main(_):
    with tf.Session() as sess:

        env = gym.make(ENV_NAME)
        # np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)
        env.seed(RANDOM_SEED)

        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        action_bound = env.action_space.high

        # Ensure action bound is symmetric
        #assert(env.action_space.high == -env.action_space.low)

        actor2 = ActorNetwork2(sess, state_dim, action_dim, action_bound,
                               ACTOR_LEARNING_RATE, TAU)

        critic2 = CriticNetwork2(sess, state_dim, action_dim, action_bound,
                                 CRITIC_LEARNING_RATE, TAU,
                                 actor2.get_num_trainable_vars())

        if GYM_MONITOR_EN:
            if not RENDER_ENV:
                env = Monitor(env,
                              MONITOR_DIR,
                              video_callable=False,
                              force=True)
            else:
                env = Monitor(env, MONITOR_DIR, force=True)

        train(sess, env, actor2, critic2)

        # if UPLOAD_GYM_RESULTS:
        #     gym.upload(MONITOR_DIR, api_key=GYM_API_KEY)

        # net_params = sess.run(actor.update_target_net_params)

        # f1 = open(ACTOR_DIR1,'w')
        # f2 = open(ACTOR_DIR2, 'w')
        # f3 = open(ACTOR_DIR3, 'w')
        #
        # #Network1 parameters storing
        # for i in range(400):
        #     for j in range(4):
        #         f1.write('%.8f \n' %net_params[0][j][i])
        #     f1.write('%.8f \n' %net_params[1][i])
        #
        # # Network2 parameters storing
        # for i in range(300):
        #     for j in range(400):
        #         f2.write('%.8f \n' %net_params[2][j][i])
        #     f2.write('%.8f \n' %net_params[3][i])
        #
        # # Network3 parameters storing
        # for i in range(1):
        #     for j in range(300):
        #         f3.write('%.8f \n' %net_params[4][j][i])
        #     f3.write('%.8f \n' %net_params[5][i])

        plt.figure(1)
        plt.subplot(121)
        plt.title('Reward')
        plt.plot(REWARD)

        plt.subplot(122)
        plt.title('Qmax average')
        plt.plot(QMAX)
        plt.show()
Example #21

def _play_randomly(env):
    env.reset()
    env.render(mode="human")
    done = False
    while not done:
        time.sleep(0.01)
        env.render(mode="human")
        obs, r, done, info = env.step(
            env.action_space.sample())  # take a random action
    env.close()


if __name__ == '__main__':
    args = parse_arguments()
    config = BreakoutConfiguration(
        brick_rows=args.rows,
        brick_cols=args.columns,
        fire_enabled=args.fire,
        ball_enabled=not args.disable_ball,
    )
    env = BreakoutDictSpace(config)
    if args.record:
        env = Monitor(env, args.output_dir)

    if args.random:
        _play_randomly(env)
    else:
        env.play()
Example #22
from gym.wrappers import Monitor
from gym.scoreboard.scoring import score_from_local

from gym_numgrid.wrappers import *
from examples.agents import *

red = '\033[91m'
yellow = '\033[93m'
green = '\033[32m'
endc = '\033[0m'

numgrid = gym.make('NumGrid-v0')
numgrid = DirectionWrapper(numgrid)

experiment_path = '/tmp/numgrid-direction-random'
env = Monitor(numgrid, experiment_path, force=True)

agent = RandomAgent(env.action_space)

reward = 0
info = {}

for i_episode in range(env.spec.trials):
    print("\n********* EPISODE", i_episode, "**********\n")
    observation = env.reset()
    done = False
    while not done:
        env.render()
        action = agent.act(observation, reward, done, info)
        digit = numgrid.action(action)[0]
        color = ''
Example #23
def deep_q_learning(sess, 
					env, 
					q_estimator, 
					target_estimator, 
					state_processor, 
					num_episodes, 
					experiment_dir, 
					replay_memory_size=500000, 
					replay_memory_init_size=50000, 
					update_target_estimator_every=10000, 
					discount_factor=0.99, 
					epsilon_start=1.0,
					epsilon_end=0.1, 
					epsilon_decay_steps=500000,
					batch_size=32, 
					record_video_every=50):
	"""
	DQN algorithm with fff-policy Temporal Differnce control 
	returns EpisodeStats object with 2 numpy arrays for episode_lengths and episode_rewards
	"""

	Transition = namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])

	replay_memory = []

	# useful statistics
	stats = plotting.EpisodeStats(
		episode_lengths = np.zeros(num_episodes), 
		episode_rewards = np.zeros(num_episodes))

	# directories for checkpoints and summaries
	checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
	checkpoint_path = os.path.join(checkpoint_dir, "model")
	monitor_path = os.path.join(experiment_dir, "monitor")


	if not os.path.exists(checkpoint_dir):
		os.makedirs(checkpoint_dir)
	if not os.path.exists(monitor_path):
		os.makedirs(monitor_path)

	saver = tf.train.Saver()
	# Load a previous checkpoint if we find one
	latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
	if latest_checkpoint:
		print("Loading model checkpoint {}...\n".format(latest_checkpoint))
		saver.restore(sess, latest_checkpoint)


	# get current time step 
	total_t = sess.run(tf.train.get_global_step())

	# epsilon decay schedule 
	epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

	# q policy we are following
	policy = make_epsilon_greedy_policy(q_estimator, len(VALID_ACTIONS))


	# load initial experience into replay memory 
	print("Populating replay memory...")
	state = env.reset()
	state = state_processor.process(sess, state)
	state = np.stack([state] * 4, axis = 2)
	for i in range(replay_memory_init_size):
		if i % 1000 == 0:
			print("iteration " + str(i))
		# according to policy, create a action probability array 
		action_probs = policy(sess, state, epsilons[min(total_t, epsilon_decay_steps-1)])
		# randomly select an action according to action probs from policy 
		action = np.random.choice(np.arange(len(VALID_ACTIONS)), p=action_probs)
		# openAI gym take a step in action space
		next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
		# process image data 
		next_state = state_processor.process(sess, next_state)
		next_state = np.append(state[:,:,1:], np.expand_dims(next_state,2), axis=2)
		# add action to replay memory 
		replay_memory.append(Transition(state, action, reward, next_state, done))
		if done:
			# if found goal, start over
			state = env.reset()
			state = state_processor.process(sess, state) 
			state = np.stack([state] * 4, axis = 2)

		else:
			# if not found goal, update state to next state
			state = next_state

	# record videos 
	# add env Monitor wrapper
	env = Monitor(env, directory=monitor_path, video_callable=lambda count: count % record_video_every == 0, resume=True)

	for i_episode in range(num_episodes):
		# save the current checkpoint
		if i_episode % 100 == 0:
			print ("episode: " + str(i_episode))
		saver.save(tf.get_default_session(), checkpoint_path)

		# reset openAI environment 
		state = env.reset()
		state = state_processor.process(sess, state)
		state = np.stack([state] * 4, axis=2)
		loss = None
		# main forloop after loading initial state 
		for t in itertools.count():

			# epsilon for this timestep 
			epsilon = epsilons[min(total_t, epsilon_decay_steps-1)]

			# add epsilon to tensorboard 
			episode_summary = tf.Summary()
			episode_summary.value.add(simple_value=epsilon, tag="epsilon")
			q_estimator.summary_writer.add_summary(episode_summary, total_t)

			# maybe update the target estimator 
			# update means copying parameters from q estimator -> target estimator 
			if total_t % update_target_estimator_every == 0:
				copy_model_parameters(sess, q_estimator, target_estimator)
				print("\nCopied model parameters to target network.")

			# Print out which step we're on, useful for debugging.
			print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format(
				t, total_t, i_episode + 1, num_episodes, loss), end="")
			sys.stdout.flush()


			# take the next step in the environment 
			# similar to earlier when loading replay memory with first step 
			action_probs = policy(sess, state, epsilon)
			action = np.random.choice(np.arange(len(VALID_ACTIONS)), p=action_probs)
			next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
			next_state = state_processor.process(sess, next_state)
			next_state = np.append(state[:,:,1:], np.expand_dims(next_state,2), axis=2)

			# if replay memory is full, pop
			if len(replay_memory) == replay_memory_size:
				replay_memory.pop(0)

			# save transition to replay memory 
			replay_memory.append(Transition(state, action, reward, next_state, done))

			# update statistics
			stats.episode_rewards[i_episode] += reward
			stats.episode_lengths[i_episode] = t

			# sample minibatch from replay memory 
			samples = random.sample(replay_memory, batch_size)
			states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(np.array, zip(*samples))

			# calculate qvalues and targets 
			# Q ALGO RIGHT HERE LMAO
			q_values_next = target_estimator.predict(sess, next_states_batch)
			targets_batch = reward_batch + np.invert(done_batch).astype(np.float32) * discount_factor * np.max(q_values_next, axis=1)


			# gradient descent 
			states_batch = np.array(states_batch)
			loss = q_estimator.update(sess, states_batch, action_batch, targets_batch)

			if done:
				break

			state = next_state
			total_t += 1



		# Add summaries to tensorboard
		episode_summary = tf.Summary()
		episode_summary.value.add(simple_value=stats.episode_rewards[i_episode], node_name="episode_reward", tag="episode_reward")
		episode_summary.value.add(simple_value=stats.episode_lengths[i_episode], node_name="episode_length", tag="episode_length")
		q_estimator.summary_writer.add_summary(episode_summary, total_t)
		q_estimator.summary_writer.flush()

		yield total_t, plotting.EpisodeStats(
			episode_lengths=stats.episode_lengths[:i_episode+1],
			episode_rewards=stats.episode_rewards[:i_episode+1])

	return stats 
Example #24
    display.display(plt.gcf())


def loop(context, i):
    env, agent = context
    control = agent(env.state)
    _, reward, _, _ = env.step(control)
    show_state(env, step=i)
    return (env, agent), reward


# ILQR
agent = ILQR()
agent.train(Acrobot(horizon=10), 10)

# for loop version
T = 75
env = Acrobot()
env = Monitor(env,
              './video',
              video_callable=lambda episode_id: True,
              force=True)
print(env.reset())
reward = 0
for i in range(T):
    (env, agent), r = loop((env, agent), i)
    reward += r

reward_forloop = reward
print('reward_forloop = ' + str(reward_forloop))
env.close()
Example #25
if __name__ == '__main__':
    # You can optionally set up the logger. Also fine to set the level
    # to logging.DEBUG or logging.WARN if you want to change the
    # amount of output.
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    env = gym.make('FlappyBird-v0' if len(sys.argv) < 2 else sys.argv[1])

    # You provide the directory to write to (can be an existing
    # directory, including one with existing data -- all monitor files
    # will be namespaced). You can also dump to a tempdir if you'd
    # like: tempfile.mkdtemp().
    outdir = 'random-agent-results'
    env = Monitor(env, directory=outdir, force=True)

    # This declaration must go *after* the monitor call, since the
    # monitor's seeding creates a new action_space instance with the
    # appropriate pseudorandom number generator.
    env.seed(0)
    agent = RandomAgent(env.action_space)

    episode_count = 100
    reward = 0
    done = False

    for i in range(episode_count):
        ob = env.reset()
        #
        while True:
Example #26
from gym.wrappers import Monitor
from sklearn import preprocessing

from atari_wrappers import FrameStack, EpisodicLifeEnv
from rl_baseline.mario_util import make_env

# raw_env = retro.make("SuperMarioBros-Nes")
from small_evo.wrappers import AutoRenderer

monitor = None
action_repeat = True
episodic_life = True
render = 2
env = retro.make("SuperMarioBros-Nes")
if monitor is not None:
    env = Monitor(env, monitor)
if render is not None:
    env = AutoRenderer(env, auto_render_period=render)
if action_repeat:
    env = FrameStack(env, 4)
if episodic_life:
    env = EpisodicLifeEnv(env, [0] * 9)
raw_env = env.unwrapped
# env = AllowBacktracking(make_env(stack=False, scale_rew=False))
first_obs = env.reset()
order = ["coins", "levelHi", "levelLo", "lives", "score", "scrolling", "time", "xscrollHi", "xscrollLo"]

index_right = raw_env.buttons.index("RIGHT")
index_a = raw_env.buttons.index("A")
index_b = raw_env.buttons.index("B")
infos = []
Example #27
import gym
import gym_traffic
from gym.wrappers import Monitor
import time

env = gym.make('Traffic-Simple-gui-v0')
from tqdm import tqdm
monitor = False
# env = gym.make('Traffic-Simple-cli-v0')

#TODO: Change simulation step size
#TODO: Add more traffic flows
#TODO: Scene image generation

if monitor:
    env = Monitor(env, "output/traffic/simple/random", force=True)
for i_episode in tqdm(range(500)):
    observation = env.reset()
    total_reward = 0
    for t in tqdm(range(1000)):
        # env.render()
        # print(observation)
        # print "\n Observation: {}".format(observation)
        env = env.unwrapped
        action = env.action_from_ttc()
        # print "\n Action: {}".format(action)
        # time.sleep(1)
        observation, reward, done, info = env.step(action)
        total_reward += reward
        # print (observation)
        # print "---------------- Observations ----------------"
Example #28
def train(path, env):
    env = Monitor(env, path, video_callable=video_callable, force=True)
    agent = Agent(env)
    agent.train()
    return agent
Example #29
        return out


generator = RoombaMazeGenerator()
maze = Maze(generator)
print(maze.to_value())

motion = Motion()
motion.add('north', [-1, 0])
motion.add('south', [1, 0])
motion.add('west', [0, -1])
motion.add('east', [0, 1])

env = RoombaEnv(maze, motion)
img = env.render('rgb_array')
plt.imshow(img)
plt.show()

from gym.wrappers import Monitor
from mazelab.solvers import dijkstra_solver

actions = dijkstra_solver(np.array(env.maze.to_impassable()), env.motion,
                          env.state.positions[0], env.goal.positions[0])
env = Monitor(env, directory='./', force=True)
env.reset()
for action in actions:
    env.step(action)

env.close()
Example #30
        acts.append(action)
        rews.append(reward)

    return obs, acts, rews


def process_rewards(rews):
    """Rewards -> Advantages for one episode. """

    # total reward: length of episode
    return [len(rews)] * len(rews)


monitor_dir = '/tmp/cartpole_exp1'

monitor = Monitor(env, monitor_dir, force=True)

sess.run(tf.global_variables_initializer())
b_obs, b_acts, b_rews = [], [], []

# for _ in range(eparams['ep_per_batch']):

obs, acts, rews = policy_rollout(env)

print('Episode steps: {}'.format(len(obs)))

b_obs.extend(obs)
b_acts.extend(acts)

advantages_rew = process_rewards(rews)
b_rews.extend(advantages_rew)