Example no. 1
def test_single_training():

    numberOfCells = 10 # in each axis
    startingPosition = (4, 5) # head
    foodPosition = (3, 6)

    env = Environment(numberOfCells, deterministic=True)
    agent = DQNAgent(state_size=env.state_size, action_size=Actions.action_size, deterministic=True, batch_size=24, memory_limit=2000)
    state = env.reset(startingPosition, foodPosition)
    agent.reset_convolutional_layers()
    full_state = agent.get_convolutional_layers(state)
    loss10 = -1
    action10 = -1

    maxsteps = 10

    for step in range(maxsteps):
        action = agent.get_exploration_action()
        next_state, reward, done = env.step(action, food_position=(1, 1))
        assert(not done)
        full_next_state = agent.get_convolutional_layers(next_state)
        assert(full_next_state.shape == (1, numberOfCells, numberOfCells, agent.numberOfLayers))
        agent.save_transition(full_state, action, reward, full_next_state, done)
        current_loss = agent.train()
        full_state = full_next_state

    loss10 = current_loss
    action10 = action

    assert(loss10 == 0.006804642267525196)
    assert(action10 == 0)
Example no. 2
def test_smoke():
    # just runs the code - no assertions

    numberOfCells = 10 # in each axis
    startingPosition = (4, 5) # head
    foodPosition = (3, 6)

    env = Environment(numberOfCells)
    agent = DQNAgent(state_size=env.state_size, action_size=Actions.action_size, deterministic=True, batch_size=24, memory_limit=2000)
    state = env.reset(startingPosition, foodPosition)
    agent.reset_convolutional_layers()
    full_state = agent.get_convolutional_layers(state)

    maxsteps = 2

    for step in range(maxsteps):
        action = agent.get_exploration_action()
        next_state, reward, done = env.step(action, food_position=(1, 1))
        full_next_state = agent.get_convolutional_layers(next_state)
        assert(full_next_state.shape == (1, numberOfCells, numberOfCells, agent.numberOfLayers))
        agent.save_transition(full_state, action, reward, full_next_state, done)
        current_loss = agent.train()

        if (step == 0):
            action1 = action
            loss1 = current_loss

        full_state = full_next_state

    loss2 = current_loss
    action2 = action
Example no. 3
    def __init__(self, env, do_render, num_threads, gamma, lr,
                 global_max_episode):

        state_size, action_size = env.observation_space.shape[0], env.action_space.n

        self.qnetwork_global = QNetwork(state_size, action_size)  #.to(device)
        self.qnetwork_global.share_memory()

        self.qnetwork_target = QNetwork(state_size, action_size)  #.to(device)
        self.qnetwork_target.share_memory()

        self.agents = [
            DQNAgent(id=id,
                     env=env,
                     do_render=do_render,
                     state_size=state_size,
                     action_size=action_size,
                     n_episodes=global_max_episode,
                     lr=lr,
                     gamma=gamma,
                     update_every=UPDATE_EVERY + num_threads,
                     global_network=self.qnetwork_global,
                     target_network=self.qnetwork_target)
            for id in range(num_threads)
        ]
Example no. 4
def main():
    config = Config()
    env = Environment(config)
    agent = DQNAgent(config)
    trainer = Trainer(config, env, agent)
    trainer.train()
    trainer.play()
Example no. 5
def test_multiepisode_training():

    numberOfCells = 10 # in each axis
    startingPosition = (4, 5) # head
    foodPosition = (3, 6)

    env = Environment(numberOfCells, deterministic=True)
    state_size = env.state_size
    action_size = Actions.action_size # 3
    agent = DQNAgent(state_size=state_size, action_size=action_size, deterministic=True, batch_size=24, memory_limit=2000)

    losses = [-1, -1, -1, -1]
    done = False

    episodes = 4
    maxsteps = 9

    for e in range(episodes):

        state = env.reset(startingPosition, foodPosition)
        agent.reset_convolutional_layers()
        full_state = agent.get_convolutional_layers(state)
        loss = 0

        for step in range(maxsteps):
            action = agent.get_exploration_action()
            next_state, reward, done = env.step(action, food_position=(1, 1)) # food generation at (1, 1) happens only once over the test
            full_next_state = agent.get_convolutional_layers(next_state)
            assert(full_next_state.shape == (1, numberOfCells, numberOfCells, agent.numberOfLayers))
            agent.save_transition(full_state, action, reward, full_next_state, done)

            current_loss = agent.train()
            loss += current_loss

            full_state = full_next_state

        losses[e] = loss

    assert(losses[0] == 3.9618697417899966)
    assert(losses[1] == 0.044194952584803104)
    assert(losses[2] == 0.1333141174982302)
    assert(losses[3] == 2.834151452407241)
Example no. 6
def main():
    """Main"""

    env_id = 'SpaceInvaders-v0'
    weight_fname = '/home/matthieu/temp/test.h5'

    env = ProcessedEnvironnement(
        env_id,
        outdir='/home/matthieu/temp/random-agent-results',
        wrappers_cond=True)
    env.seed(0)
    network = ConvNet(input_shape=(84, 84, 1),
                      nbr_action=env.action_space.n,
                      weight_fname=weight_fname)
    agent = DQNAgent(action_space=env.action_space,
                     network=network,
                     obs_shape=(84, 84, 1),
                     buffer_size=6,
                     decay=0.0,
                     epsilon=0.9)
    episode_count = 1
    reward = 0
    action_repetition_rate = 4
    action = 0
    for i in range(episode_count):
        ob = env.reset()
        done = True
        counter = 0
        while True:
            if counter % action_repetition_rate == 0:
                action = agent.act(ob, reward, done)
                print(action)
            ob, reward, done, _ = env.step(action)
            counter += 1
            if done:
                break

    # Close the env and write monitor result info to disk
    env.close()
Example no. 7
def train(network=None, expert_data_path=None):
    env = make_env()
    env_spec = acme.make_environment_spec(env)

    if network is None:
        network = make_dqn(env_spec.actions.num_values)

    expert_data = None
    if expert_data_path is not None:
        with open(expert_data_path, "rb") as handle:
            expert_data = pickle.load(handle)
        num_timesteps = np.sum([1 + len(ep["mid"]) for ep in expert_data])
        print(f"Using expert data from {expert_data_path}. "
              f"Episodes: {len(expert_data)}. Timesteps: {num_timesteps}.")

    agent = DQNAgent(environment_spec=env_spec,
                     network=network,
                     batch_size=32,
                     learning_rate=1e-4,
                     logger=loggers.NoOpLogger(),
                     min_replay_size=1000,
                     max_replay_size=int(1e5),
                     target_update_period=2500,
                     epsilon=tf.Variable(0.025),
                     n_step=20,
                     discount=0.97,
                     expert_data=expert_data)

    loop = EnvironmentLoop(environment=env,
                           actor=agent,
                           module2save=network)
    reward_history = loop.run(num_steps=int(1e6),
                              render=True,
                              checkpoint=True,
                              checkpoint_freq=15)

    avg_hist = [np.mean(reward_history[i:(i+50)])
                for i in range(len(reward_history) - 50)]
    plt.plot(list(range(len(avg_hist))), avg_hist)
    plt.show()

    env.close()
    return network
Example no. 8
def main():
    random.seed(SEED)

    # Create agent-directory
    execution_time = str(round(time.time()))

    agent_dir = os.path.join("agents", ALGORITHM,
                             ENVIRONMENT + "_" + execution_time)
    os.makedirs(agent_dir)

    # Initialize utils, environment and agent
    utils = Utils(agent_dir, FRAMES_PER_EPOCH, EPOCHS * FRAMES_PER_EPOCH)
    env = gym.make(ENVIRONMENT)

    try:
        env.env.frameskip = FRAMESKIP
        env.env.ale.setFloat("repeat_action_probability", REPEAT_ACTION_PROB)
        if ALGORITHM == 'MFEC':
            if AGENT_PATH:
                agent = MFECAgent.load(AGENT_PATH)
            else:
                agent = MFECAgent(
                    ACTION_BUFFER_SIZE,
                    K,
                    DISCOUNT,
                    EPSILON,
                    SCALE_HEIGHT,
                    SCALE_WIDTH,
                    STATE_DIMENSION,
                    range(env.action_space.n),
                    SEED,
                )
        else:
            agent = DQNAgent(env.action_space.n)
            if AGENT_PATH:
                agent.load(AGENT_PATH)

        run_algorithm(agent, agent_dir, env, utils)

    finally:
        utils.close()
        env.close()
Example no. 9
    env = NoopResetEnv(env, noop_max=30)
    env = MaxAndSkipEnv(env, skip=4)
    env = EpisodicLifeEnv(env)
    env = FireResetEnv(env)
    env = WarpFrame(env)
    env = PyTorchFrame(env)
    env = ClipRewardEnv(env)
    env = FrameStack(env, 4)

    replay_buffer = ReplayBuffer(args.replay_buffer_size)

    agent = DQNAgent(
        env.observation_space,
        env.action_space,
        replay_buffer,
        use_double_dqn=args.use_double_dqn,
        lr=args.lr,
        batch_size=args.batch_size,
        gamma=args.gamma
    )

    eps_timesteps = args.eps_fraction * float(args.num_steps)
    episode_rewards = [0.0]
    loss = [0.0]

    state = env.reset()
    for t in range(args.num_steps):
        fraction = min(1.0, float(t) / eps_timesteps)
        eps_threshold = args.eps_start + fraction * (args.eps_end - args.eps_start)
        sample = random.random()
        if sample > eps_threshold:
Example no. 10
def main():
    config = {'starting-floor': 0, 'total-floors': 9, 'dense-reward': 1,
              'lighting-type': 0, 'visual-theme': 0, 'default-theme': 0, 'agent-perspective': 1, 'allowed-rooms': 0,
              'allowed-modules': 0,
              'allowed-floors': 0,
              }
    env = ObstacleTowerEnv('./ObstacleTower/obstacletower',
                           worker_id=1, retro=True, realtime_mode=False, config=config)
    print(env.observation_space)
    print(env.action_space)

    hyper_params = {
        "seed": 6,  # which seed to use
        "replay-buffer-size": int(5e3),  # replay buffer size
        "learning-rate": 1e-4,  # learning rate for Adam optimizer
        "discount-factor": 0.99,  # discount factor
        "num-steps": int(1e6),  # total number of steps to run the environment for
        "batch-size": 32,  # number of transitions to optimize at the same time
        "learning-starts": 5000,  # number of steps before learning starts
        "learning-freq": 1,  # number of iterations between every optimization step
        "use-double-dqn": True,  # use double deep Q-learning
        "target-update-freq": 1000,  # number of iterations between every target network update
        "eps-start": 1.0,  # e-greedy start threshold
        "eps-end": 0.01,  # e-greedy end threshold
        "eps-fraction": 0.05,  # fraction of num-steps
        "print-freq": 10
    }

    np.random.seed(hyper_params["seed"])
    random.seed(hyper_params["seed"])

    #assert "NoFrameskip" in hyper_params["env"], "Require environment with no frameskip"
    #env = gym.make(hyper_params["env"])
    env.seed(hyper_params["seed"])

    #env = NoopResetEnv(env, noop_max=30)
    #env = MaxAndSkipEnv(env, skip=4)
    #env = EpisodicLifeEnv(env)
    #env = FireResetEnv(env)
    # env = WarpFrame(env)
    env = PyTorchFrame(env)
    # env = ClipRewardEnv(env)
    # env = FrameStack(env, 4)

    replay_buffer = ReplayBuffer(hyper_params["replay-buffer-size"])

    agent = DQNAgent(
        env.observation_space,
        env.action_space,
        replay_buffer,
        use_double_dqn=hyper_params["use-double-dqn"],
        lr=hyper_params["learning-rate"],
        batch_size=hyper_params["batch-size"],
        gamma=hyper_params["discount-factor"]
    )

    model_num = 500
    agent.policy_network.load_state_dict(torch.load('./Models/' + str(model_num) + '_policy.pt', map_location=torch.device(device)))

    eps_timesteps = hyper_params["eps-fraction"] * float(hyper_params["num-steps"])
    episode_rewards = [0.0]
    ep_nums = model_num

    state = env.reset()
    for t in range(hyper_params["num-steps"]):
        fraction = min(1.0, float(t) / eps_timesteps)
        eps_threshold = hyper_params["eps-start"] + fraction * (hyper_params["eps-end"] - hyper_params["eps-start"])
        sample = random.random()
        # TODO
        #  select random action if sample is less equal than eps_threshold
        # take step in env
        # add state, action, reward, next_state, float(done) to reply memory - cast done to float
        # add reward to episode_reward
        if sample > eps_threshold:
            action = agent.act(np.array(state))
        else:
            action = env.action_space.sample()

        next_state, reward, done, _ = env.step(action)
        agent.memory.add(state, action, reward, next_state, float(done))
        state = next_state

        episode_rewards[-1] += reward
        if done:
            state = env.reset()
            episode_rewards.append(0.0)
            ep_nums += 1
            if ep_nums % 50 == 0:
                agent.save_models(ep_nums)
                plot(episode_rewards, ep_nums)

        if t > hyper_params["learning-starts"] and t % hyper_params["learning-freq"] == 0:
            agent.optimise_td_loss()

        if t > hyper_params["learning-starts"] and t % hyper_params["target-update-freq"] == 0:
            agent.update_target_network()

        num_episodes = len(episode_rewards)

        if done and hyper_params["print-freq"] is not None and \
                len(episode_rewards) % hyper_params["print-freq"] == 0:
            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            print("********************************************************")
            print("steps: {}".format(t))
            print("episodes: {}".format(num_episodes))
            print("mean 100 episode reward: {}".format(mean_100ep_reward))
            print("% time spent exploring: {}".format(int(100 * eps_threshold)))
            print("********************************************************")


        #if done and ep_nums % 10 == 0:
        #    animate(env,agent,"anim/progress_"+str(ep_nums))
        #    state = env.reset()

    animate(env, agent, "anim/final")


    env.close()
Example no. 11
# We can use proportional or rank-based prioritized replay (proportional seems to be preferred by many papers)
# Simple, non-prioritized replay is also implemented

alpha_scheduler = dqn.annealing_schedules.Constant(0.7)
beta_scheduler = dqn.annealing_schedules.Constant(0.5)
memory = dqn.experience_replay.Proportional(capacity=50000,
                                            alpha_scheduler=alpha_scheduler,
                                            beta_scheduler=beta_scheduler)
##memory = dqn.experience_replay.RankBased(capacity=50000, alpha_scheduler=alpha_scheduler, beta_scheduler=beta_scheduler)
##memory = dqn.experience_replay.Simple(capacity=50000)

# Below we add n-step learning with the parameter n-step
# Not yet supported: Frame skipping will be added in the future
agent = DQNAgent(network=q_func,
                 observation_space=env.observation_space,
                 action_space=env.action_space,
                 action_selection=action_selection,
                 loss=loss,
                 update_target=update_target,
                 memory=memory,
                 n_step=3,
                 update_target_network_frequency=2000)

agent.train(env, num_timesteps=num_steps, render=False)

# We can save and load an agent
# Note: Currently this only saves the weights of the network -- the entire agent must be recreated (or reused, as would happen here) before calling load
##agent.save('/tmp/save_test/test')
##agent.load('/tmp/save_test/test')
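
For reference, a minimal NumPy sketch of the two mechanisms the comments in this example name, proportional prioritization and the n-step return. It is an illustration of the technique, not the dqn library's internals; the function names are hypothetical, and the alpha/beta defaults mirror the schedulers configured above.

import numpy as np

def proportional_sample(priorities, batch_size, alpha=0.7, beta=0.5):
    # sample index i with probability p_i**alpha / sum_j p_j**alpha,
    # and return importance-sampling weights (1 / (N * P(i)))**beta
    priorities = np.asarray(priorities, dtype=np.float64)
    probs = priorities ** alpha
    probs /= probs.sum()
    indices = np.random.choice(len(priorities), size=batch_size, p=probs)
    weights = (len(priorities) * probs[indices]) ** (-beta)
    weights /= weights.max()  # normalize by the largest weight for stability
    return indices, weights

def n_step_return(rewards, bootstrap_value, gamma=0.99):
    # n-step target: r_t + gamma*r_{t+1} + ... + gamma**(n-1)*r_{t+n-1} + gamma**n * bootstrap
    g = 0.0
    for k, r in enumerate(rewards):
        g += (gamma ** k) * r
    return g + (gamma ** len(rewards)) * bootstrap_value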
Example no. 12
def train_snake():

    # todo: move all these parameters into a configuration file
    numberOfCells = 10  # in each axis
    startingPosition = (4, 5)  # head
    foodPosition = (3, 6)
    max_steps_allowed = 1000

    env = Environment(numberOfCells)
    state_size = env.state_size  #(numberOfCells x numberOfCells)
    action_size = Actions.action_size  # 3
    agent = DQNAgent(state_size=state_size,
                     action_size=action_size,
                     batch_size=32,
                     memory_limit=6000,
                     number_of_channels=5)

    episodes = 30000
    decay = 0.9 / episodes * 2  # controls how fast epsilon shifts from explore to exploit

    epochs = []
    losses = []
    steps_list = []

    with open('training_data', 'w') as f:

        for e in range(episodes):

            state = env.reset(startingPosition)
            #print('state array reset: \n', state)

            agent.reset_convolutional_layers()
            full_state = agent.get_convolutional_layers(state)
            loss = 0.0
            steps = 0
            done = False
            episode_reward = 0

            while not done:

                # state at this point is just a 2D array
                action = agent.get_action(full_state)
                #action = agent.get_raction()
                #print('action chosen: ', action)

                # step onto the next state
                next_state, reward, done = env.step(action)

                #print('state array after step ', steps, ' : \n', next_state)
                #print('reward returned: ', reward)
                #print('next state: ', next_state)

                # we store the next_state in (1,H,W,C)
                full_next_state = agent.get_convolutional_layers(next_state)
                #print('full next state: \n:', full_next_state)
                #assert(full_next_state.shape == (1, numberOfCells, numberOfCells, agent.numberOfLayers))

                # save S,A,R,S' to experience
                # full states are a snapshot - copies of the state
                agent.save_transition(full_state, action, reward,
                                      full_next_state, done)
                episode_reward += reward

                # use alternative policy to train model - rely on experience only
                current_loss = agent.train()
                #print('current_loss: ', current_loss)
                loss += current_loss
                full_state = full_next_state

                # limit max steps - prevent an episode from running forever
                steps += 1
                if steps >= max_steps_allowed:
                    done = True

            # next episode
            if agent.epsilon > 0.1:
                agent.epsilon -= decay  # agent slowly reduces exploring

            print(
                'episode: {:5d} steps: {:3d} epsilon: {:.3f} loss: {:8.4f} reward: {:3d} fruits: {:2d}'
                .format(e, steps, agent.epsilon, loss, episode_reward,
                        env.fruits_eaten))
            f.write('{:5d} {:3d} {:8.4f} {:4d} {:2d}\n'.format(
                e, steps, loss, episode_reward, env.fruits_eaten))

        agent.model.save('trained_snake.model')
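
As a quick check of the schedule above (assuming the agent's epsilon starts at 1.0, which the snippet does not show), the decay constant works out as follows:

episodes = 30000
decay = 0.9 / episodes * 2       # = 6e-05 subtracted from epsilon per episode
episodes_to_floor = 0.9 / decay  # = 15000, so epsilon reaches the 0.1 floor halfway through training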
Example no. 13
    action='store',
    help="Please specify the agent you wish to use, either DQN or A3C",
    required=True)
parser.add_argument(
    "-n",
    "--mode",
    type=str,
    action='store',
    help="Please specify the mode you wish to run, either train or eval",
    required=True)

args = parser.parse_args()
print(args)

if args.model == 'DQN':
    agent = DQNAgent()

    if args.mode == 'train':
        agent.train()

    if args.mode == 'eval':
        agent.Evaluate()

if args.model == 'A3C':
    agent = A3CGlobalAgent()

    if args.mode == 'train':
        agent.train()

    if args.mode == 'eval':
        agent.Evaluate()
Example no. 14
import gym

import dqn.algorithms
import dqn.annealing_schedules
import dqn.experience_replay
import tensorflow.contrib.layers as layers

env = gym.make('CartPole-v1')
num_steps=200000

# Here we combine the same improvements from Rainbow, but use QR instead of C51
# Note that we are still using a DistributionalQNetwork, but this network uses n as the number of quantiles rather than the number of atoms
# TODO: Do we want to allow noisy_net=False ? Does this make sense or not ?
q_func = nn.DistributionalQNetwork([64], env.action_space.n, n=75, noisy_net=True, dueling=[32])
epsilon_scheduler = dqn.annealing_schedules.Constant(0)
action_selection = dqn.algorithms.EpsilonGreedy(epsilon_scheduler)
loss = dqn.algorithms.QuantileRegressionLoss()
update_target = dqn.algorithms.HardUpdate()
alpha_scheduler = dqn.annealing_schedules.Constant(0.7)
beta_scheduler = dqn.annealing_schedules.Constant(0.5)
memory = dqn.experience_replay.Proportional(capacity=100000, alpha_scheduler=alpha_scheduler, beta_scheduler=beta_scheduler)

agent = DQNAgent(network=q_func,
                 observation_space=env.observation_space,
                 action_space=env.action_space,
                 action_selection=action_selection,
                 loss=loss,
                 update_target=update_target,
                 memory=memory,
                 n_step=3,
                 update_target_network_frequency=100)

agent.load('save/qr_dqn')
agent.run(env, num_timesteps=num_steps, render=True)
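
For reference, a minimal NumPy sketch of the quantile Huber loss that QuantileRegressionLoss refers to (QR-DQN, Dabney et al. 2018). The function name and array shapes are illustrative assumptions, not the dqn library's API.

import numpy as np

def quantile_huber_loss(pred_quantiles, target_quantiles, kappa=1.0):
    # pred_quantiles: (N,) predicted quantiles theta_i for the chosen action
    # target_quantiles: (M,) samples from the target distribution
    n = len(pred_quantiles)
    tau_hat = (2 * np.arange(n) + 1) / (2.0 * n)                # quantile midpoints
    u = target_quantiles[None, :] - pred_quantiles[:, None]     # pairwise TD errors, shape (N, M)
    huber = np.where(np.abs(u) <= kappa,
                     0.5 * u ** 2,
                     kappa * (np.abs(u) - 0.5 * kappa))
    loss = np.abs(tau_hat[:, None] - (u < 0)) * huber / kappa   # asymmetric quantile weighting
    return loss.mean(axis=1).sum()                              # mean over targets, sum over quantiles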
Example no. 15
                            realtime_mode=False, config=config)
    env.seed(random_seed)

    # Run with specific wrappers #
    # This is the only wrapper we used, as the others didn't add enough value
    env = PyTorchFrame(env)
    # env = FrameStack(env, 3)
    # env = HumanActionEnv(env)

    # Create Agent to Train
    replay_buffer = ReplayBuffer(int(5e3))
    agent = DQNAgent(
        env.observation_space,
        env.action_space,
        replay_buffer,
        use_double_dqn=True,
        lr=args.lr,
        batch_size=hyper_params["batch-size"],
        gamma=hyper_params["discount-factor"],
    )

    # If we have pretrained weights, load them
    if(args.checkpoint):
        print(f"Loading a policy - { args.checkpoint } ")
        agent.policy_network.load_state_dict(torch.load(args.checkpoint))

    eps_timesteps = hyper_params["eps-fraction"] * float(hyper_params["num-steps"])
    episode_rewards = [0.0]
    step_count = 0
    state = env.reset()
    for t in range(hyper_params["num-steps"]):
Example no. 16
import gym
import numpy as np

from dqn.agent import DQNAgent
from dqn.agent import EPISODES, EPISODE_LENGTH, BATCH_SIZE

environment_name = 'CartPole-v1'
environment = gym.make(environment_name)
environment.max_episode_steps = EPISODE_LENGTH

n_actions = environment.action_space.n
n_state_features = environment.observation_space.shape[0]

# Initialize DQN agent
agent = DQNAgent(n_state_features, n_actions)

for episode in range(EPISODES):

    state = environment.reset()
    state = np.reshape(state, [1, n_state_features])

    for t in range(EPISODE_LENGTH):

        # Predict next action using NN Value Function Approximation
        action = agent.get_action(state)

        # Interact with the environment and observe new state and reward
        next_state, reward, terminated, info = environment.step(action)

        # Huge negative reward if failed
        if terminated:
Example no. 17
    env = PyTorchFrame(env)
    env = ClipRewardEnv(env)
    env = FrameStack(env, 4)
    env = gym.wrappers.Monitor(
        env,
        './video/',
        video_callable=lambda episode_id: episode_id % 50 == 0,
        force=True)

    replay_buffer = ReplayBuffer(hyper_params["replay-buffer-size"])

    agent = DQNAgent(
        env.observation_space,
        env.action_space,
        replay_buffer,
        use_double_dqn=hyper_params["use-double-dqn"],
        lr=hyper_params['learning-rate'],
        batch_size=hyper_params['batch-size'],
        gamma=hyper_params['discount-factor'],
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
        dqn_type=hyper_params["dqn_type"])

    if (args.load_checkpoint_file):
        print(f"Loading a policy - { args.load_checkpoint_file } ")
        agent.policy_network.load_state_dict(
            torch.load(args.load_checkpoint_file))

    eps_timesteps = hyper_params["eps-fraction"] * \
        float(hyper_params["num-steps"])
    episode_rewards = [0.0]

    state = env.reset()
Example no. 18
# helper method for reshaping the cartpole observation
def reshape(state):
    return np.reshape(state, [1, 4])


if __name__ == '__main__':
    tf.compat.v1.disable_eager_execution()
    max_score = 0

    n_episodes = 5000
    max_env_steps = 1000

    env = gym.make('CartPole-v0')
    agent = DQNAgent(env=env,
                     net=NN(alpha=0.001, decay=0.0001),
                     memory=ReplayMemory(size=100000))

    if max_env_steps is not None:
        env._max_episode_steps = max_env_steps

    for e in range(n_episodes):
        # reset the env
        state = reshape(env.reset())
        done = False
        score = 0
        # play until env done
        while not done:
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            # env.render()
Example no. 19
                                type=int)

    args, unknowns = cmdline_parser.parse_known_args()

    history_length = args.history_length
    num_actions = args.num_actions

    Q = CNN(state_dim, num_actions, history_length, hidden=256, lr=1e-3)
    Q_target = CNNTargetNetwork(state_dim,
                                num_actions,
                                history_length,
                                hidden=256,
                                lr=1e-3)
    agent = DQNAgent(Q,
                     Q_target,
                     num_actions,
                     discount_factor=0.99,
                     batch_size=64,
                     epsilon=0.05)
    agent.load("./models_carracing/dqn_agent.ckpt")

    n_test_episodes = 15

    episode_rewards = []
    for i in range(n_test_episodes):
        stats = run_episode(env,
                            agent,
                            deterministic=True,
                            do_training=False,
                            rendering=True)
        episode_rewards.append(stats.episode_reward)
Example no. 20
                                                  end=0.02,
                                                  num_steps=31 / 32 *
                                                  num_steps)
action_selection = dqn.algorithms.GaussianRandomProcess(stddev_scheduler)
loss = dqn.algorithms.NAFLoss()  # TODO: ADD IN ALL OPTIONS HERE AND IN OTHER ONES
update_target = dqn.algorithms.SoftUpdate(tau=0.001)
alpha_scheduler = dqn.annealing_schedules.Constant(0.7)
beta_scheduler = dqn.annealing_schedules.Constant(0.5)
memory = dqn.experience_replay.Proportional(capacity=1000000,
                                            alpha_scheduler=alpha_scheduler,
                                            beta_scheduler=beta_scheduler)

agent = DQNAgent(network=q_func,
                 observation_space=env.observation_space,
                 action_space=env.action_space,
                 action_selection=action_selection,
                 loss=loss,
                 update_target=update_target,
                 memory=memory,
                 n_step=1,
                 batch_size=100,
                 discount_factor=0.99,
                 replay_period=1,
                 replays_per_step=5,
                 update_with_replay=True,
                 update_target_network_frequency=1)

agent.train(env, num_timesteps=num_steps, render=False)
##agent.save('/tmp/save_data_new/naf')
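
A minimal sketch of the soft ("Polyak") target update that SoftUpdate(tau=0.001) denotes, written for plain lists of NumPy weight arrays; this illustrates the mechanism and is not the dqn library's implementation.

def soft_update(online_weights, target_weights, tau=0.001):
    # target <- tau * online + (1 - tau) * target, applied per weight tensor
    return [tau * w + (1.0 - tau) * wt
            for w, wt in zip(online_weights, target_weights)]

With a small tau the target network trails the online network slowly, which is consistent with update_target_network_frequency=1 above.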
Example no. 21
    env.seed(hyper_params['seed'])
    #env = NoopResetEnv(env, noop_max=30)
    env = MaxAndSkipEnv(env, skip=4)
    #env = EpisodicLifeEnv(env)
    #env = FireResetEnv(env)
    env = WarpFrame(env)
    env = PyTorchFrame(env)
    env = ClipRewardEnv(env)
    env = FrameStack(env, 3)

    replay_buffer = ReplayBuffer(hyper_params['replay_buffer_size'])

    agent = DQNAgent(env.observation_space,
                     env.action_space,
                     replay_buffer,
                     use_double_dqn=hyper_params['use_double_dqn'],
                     lr=hyper_params['learning_rate'],
                     batch_size=hyper_params['batch_size'],
                     gamma=hyper_params['discount_factor'])

    eps_timesteps = hyper_params['eps_fraction'] * float(
        hyper_params['num_steps'])
    episode_rewards = [0.0]
    loss = [0.0]
    policy_actions = unpickle_object('action_map')

    state = env.reset()
    for t in range(hyper_params['num_steps']):
        fraction = min(1.0, float(t) / eps_timesteps)
        eps_threshold = hyper_params['eps_start'] + fraction * (
            hyper_params['eps_end'] - hyper_params['eps_start'])
Example no. 22
import gym
import numpy as np

from dqn.agent import DQNAgent
from dqn.agent import EPISODES, EPISODE_LENGTH

environment_name = 'CartPole-v1'
environment = gym.make(environment_name)
environment.max_episode_steps = EPISODE_LENGTH

n_actions = environment.action_space.n
n_state_features = environment.observation_space.shape[0]

# Initialize DQN agent
agent = DQNAgent(n_state_features, n_actions, epsilon=0.0)

# Load pre-trained agent
agent.load(f'./models/{environment_name}.h5')

for episode in range(EPISODES):

    state = environment.reset()
    state = np.reshape(state, [1, n_state_features])

    for t in range(EPISODE_LENGTH):

        # Visualize environment
        environment.render()

        # Predict next action using NN Value Function Approximation
        action = agent.get_action(state)