def main():
    """Roll out a DQN agent on 'SpaceInvaders-v0' for a fixed number of episodes.

    Builds a ``ProcessedEnvironnement``, a ``ConvNet`` (given ``weight_fname``)
    and a ``DQNAgent``, then steps the environment until each episode
    reports ``done``. The agent is only queried every
    ``action_repetition_rate`` steps; in between, the last chosen action is
    repeated. Each freshly chosen action is printed.

    The environment is closed in a ``finally`` clause so it is released even
    when building the network/agent or stepping the environment raises.
    """

    env_id = 'SpaceInvaders-v0'
    weight_fname = '/home/matthieu/temp/test.h5'

    env = ProcessedEnvironnement(
        env_id,
        outdir='/home/matthieu/temp/random-agent-results',
        wrappers_cond=True)
    try:
        env.seed(0)
        network = ConvNet(input_shape=(84, 84, 1),
                          nbr_action=env.action_space.n,
                          weight_fname=weight_fname)
        agent = DQNAgent(action_space=env.action_space,
                         network=network,
                         obs_shape=(84, 84, 1),
                         buffer_size=6,
                         decay=0.0,
                         epsilon=0.9)
        episode_count = 1
        # NOTE(review): reward is initialised once, not per episode, so with
        # episode_count > 1 the last reward of an episode is what the agent
        # receives on the first act() of the next one -- confirm intended.
        reward = 0
        action_repetition_rate = 4
        action = 0
        for _ in range(episode_count):
            ob = env.reset()
            # NOTE(review): done starts as True right after reset(); presumably
            # this tells the agent a new episode begins -- confirm against
            # DQNAgent.act before changing.
            done = True
            counter = 0
            while True:
                # Ask the agent for a new action only every
                # `action_repetition_rate` steps; otherwise repeat the last one.
                if counter % action_repetition_rate == 0:
                    action = agent.act(ob, reward, done)
                    print(action)
                ob, reward, done, _ = env.step(action)
                counter += 1
                if done:
                    break
    finally:
        # Close the env and write monitor result info to disk, even on error
        env.close()
Esempio n. 2
0
                     gamma=hyper_params['discount_factor'])

    eps_timesteps = hyper_params['eps_fraction'] * float(
        hyper_params['num_steps'])
    episode_rewards = [0.0]
    loss = [0.0]
    policy_actions = unpickle_object('action_map')

    state = env.reset()
    for t in range(hyper_params['num_steps']):
        fraction = min(1.0, float(t) / eps_timesteps)
        eps_threshold = hyper_params['eps_start'] + fraction * (
            hyper_params['eps_end'] - hyper_params['eps_start'])
        sample = random.random()
        if sample > eps_threshold:
            action = agent.act(np.array(state))
        else:
            action = np.random.randint(0, 4)

        env_action = policy_actions[action]
        next_state, reward, done, _ = env.step(env_action)
        agent.memory.add(state, action, reward, next_state, float(done))
        state = next_state

        episode_rewards[-1] += reward
        if done:
            state = env.reset()
            episode_rewards.append(0.0)

        if t > hyper_params[
                'learning_starts'] and t % hyper_params['learning_freq'] == 0:
Esempio n. 3
0
def main():
    """Train a DQN agent on Obstacle Tower, starting from a saved policy.

    Steps performed:

    * create the ``ObstacleTowerEnv`` with a fixed ``config`` and wrap it in
      ``PyTorchFrame``;
    * seed ``numpy``, ``random`` and the environment;
    * build a ``DQNAgent`` with a ``ReplayBuffer`` and load the policy network
      weights from ``./Models/<model_num>_policy.pt``;
    * run ``num-steps`` environment steps with an epsilon-greedy policy whose
      epsilon is interpolated linearly from ``eps-start`` to ``eps-end`` over
      the first ``eps-fraction * num-steps`` steps;
    * after ``learning-starts`` steps, optimise the TD loss every
      ``learning-freq`` steps and update the target network every
      ``target-update-freq`` steps;
    * every 50 finished episodes save the models and plot the rewards, and
      every ``print-freq`` episodes print a progress summary;
    * finally call ``animate`` on the trained agent.

    The environment is closed in a ``finally`` clause so that it is released
    even when training raises or is interrupted.
    """
    config = {'starting-floor': 0, 'total-floors': 9, 'dense-reward': 1,
              'lighting-type': 0, 'visual-theme': 0, 'default-theme': 0, 'agent-perspective': 1, 'allowed-rooms': 0,
              'allowed-modules': 0,
              'allowed-floors': 0,
              }
    env = ObstacleTowerEnv('./ObstacleTower/obstacletower',
                           worker_id=1, retro=True, realtime_mode=False, config=config)
    try:
        print(env.observation_space)
        print(env.action_space)

        hyper_params = {
            "seed": 6,  # which seed to use
            "replay-buffer-size": int(5e3),  # replay buffer size
            "learning-rate": 1e-4,  # learning rate for Adam optimizer
            "discount-factor": 0.99,  # discount factor
            "num-steps": int(1e6),  # total number of steps to run the environment for
            "batch-size": 32,  # number of transitions to optimize at the same time
            "learning-starts": 5000,  # number of steps before learning starts
            "learning-freq": 1,  # number of iterations between every optimization step
            "use-double-dqn": True,  # use double deep Q-learning
            "target-update-freq": 1000,  # number of iterations between every target network update
            "eps-start": 1.0,  # e-greedy start threshold
            "eps-end": 0.01,  # e-greedy end threshold
            "eps-fraction": 0.05,  # fraction of num-steps
            "print-freq": 10
        }

        np.random.seed(hyper_params["seed"])
        random.seed(hyper_params["seed"])

        #assert "NoFrameskip" in hyper_params["env"], "Require environment with no frameskip"
        #env = gym.make(hyper_params["env"])
        env.seed(hyper_params["seed"])

        # Only the PyTorchFrame wrapper is active; the others are disabled.
        #env = NoopResetEnv(env, noop_max=30)
        #env = MaxAndSkipEnv(env, skip=4)
        #env = EpisodicLifeEnv(env)
        #env = FireResetEnv(env)
        # env = WarpFrame(env)
        env = PyTorchFrame(env)
        # env = ClipRewardEnv(env)
        # env = FrameStack(env, 4)

        replay_buffer = ReplayBuffer(hyper_params["replay-buffer-size"])

        agent = DQNAgent(
            env.observation_space,
            env.action_space,
            replay_buffer,
            use_double_dqn=hyper_params["use-double-dqn"],
            lr=hyper_params["learning-rate"],
            batch_size=hyper_params["batch-size"],
            gamma=hyper_params["discount-factor"]
        )

        # Resume from a previously saved policy network.
        model_num = 500
        agent.policy_network.load_state_dict(
            torch.load('./Models/' + str(model_num) + '_policy.pt',
                       map_location=torch.device(device)))

        eps_start = hyper_params["eps-start"]
        eps_end = hyper_params["eps-end"]
        eps_timesteps = hyper_params["eps-fraction"] * float(hyper_params["num-steps"])
        # The last entry is the running total of the episode in progress.
        episode_rewards = [0.0]
        # Episode counter continues from the loaded model number, so that
        # save_models() is called with increasing numbers past the checkpoint.
        ep_nums = model_num

        state = env.reset()
        for t in range(hyper_params["num-steps"]):
            # Linear interpolation of epsilon, clamped once `eps_timesteps`
            # steps have elapsed.
            fraction = min(1.0, float(t) / eps_timesteps)
            eps_threshold = eps_start + fraction * (eps_end - eps_start)
            sample = random.random()

            # Epsilon-greedy: exploit when the draw exceeds the threshold,
            # otherwise take a uniformly sampled action.
            if sample > eps_threshold:
                action = agent.act(np.array(state))
            else:
                action = env.action_space.sample()

            # Step the environment and store the transition; `done` is cast
            # to float before it is added to the replay memory.
            next_state, reward, done, _ = env.step(action)
            agent.memory.add(state, action, reward, next_state, float(done))
            state = next_state

            episode_rewards[-1] += reward
            if done:
                state = env.reset()
                episode_rewards.append(0.0)
                ep_nums += 1
                # Periodic checkpoint and reward plot.
                if ep_nums % 50 == 0:
                    agent.save_models(ep_nums)
                    plot(episode_rewards,ep_nums)

            if t > hyper_params["learning-starts"] and t % hyper_params["learning-freq"] == 0:
                agent.optimise_td_loss()

            if t > hyper_params["learning-starts"] and t % hyper_params["target-update-freq"] == 0:
                agent.update_target_network()

            if done and hyper_params["print-freq"] is not None and len(episode_rewards) % hyper_params[
                "print-freq"] == 0:
                num_episodes = len(episode_rewards)
                # [-101:-1] skips the freshly appended 0.0 of the new episode
                # and averages up to the last 100 finished ones.
                mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
                print("********************************************************")
                print("steps: {}".format(t))
                print("episodes: {}".format(num_episodes))
                print("mean 100 episode reward: {}".format(mean_100ep_reward))
                print("% time spent exploring: {}".format(int(100 * eps_threshold)))
                print("********************************************************")

            #if done and ep_nums % 10 == 0:
            #    animate(env,agent,"anim/progress_"+str(ep_nums))
            #    state = env.reset()

        animate(env,agent,"anim/final")
    finally:
        # Always release the environment, including on errors or Ctrl-C.
        env.close()
Esempio n. 4
0
            torch.load(args.load_checkpoint_file))

    eps_timesteps = hyper_params["eps-fraction"] * \
        float(hyper_params["num-steps"])
    episode_rewards = [0.0]

    state = env.reset()
    for t in range(hyper_params["num-steps"]):
        fraction = min(1.0, float(t) / eps_timesteps)
        eps_threshold = hyper_params["eps-start"] + fraction * \
            (hyper_params["eps-end"] - hyper_params["eps-start"])
        sample = random.random()

        if (sample > eps_threshold):
            # Exploit
            action = agent.act(state)
        else:
            # Explore
            action = env.action_space.sample()

        next_state, reward, done, info = env.step(action)
        agent.memory.add(state, action, reward, next_state, float(done))
        state = next_state

        episode_rewards[-1] += reward
        if done:
            state = env.reset()
            episode_rewards.append(0.0)

        if t > hyper_params[
                "learning-starts"] and t % hyper_params["learning-freq"] == 0:
Esempio n. 5
0
                     batch_size=hyper_params['batch-size'],
                     gamma=hyper_params['discount-factor'])

    eps_timesteps = hyper_params["eps-fraction"] * float(
        hyper_params["num-steps"])
    episode_rewards = [0.0]

    state = env.reset()
    for t in range(hyper_params["num-steps"]):
        fraction = min(1.0, float(t) / eps_timesteps)
        eps_threshold = hyper_params["eps-start"] + fraction * (
            hyper_params["eps-end"] - hyper_params["eps-start"])
        sample = random.random()
        # TODO
        if sample > eps_threshold:
            action = agent.act(np.array(state))
        else:
            action = env.action_space.sample()

        next_state, reward, done, _ = env.step(action)
        agent.memory.add(state, action, reward, next_state, float(done))
        state = next_state

        episode_rewards[-1] += reward
        if done:
            state = env.reset()
            episode_rewards.append(0.0)

        if t > hyper_params[
                "learning-starts"] and t % hyper_params["learning-freq"] == 0:
            agent.optimise_td_loss()