def test_sarsa():
    env = TwoRoundDeterministicRewardEnv()
    np.random.seed(123)
    env.seed(123)
    random.seed(123)
    nb_actions = env.action_space.n

    # Next, we build a very simple model.
    model = Sequential()
    model.add(Dense(16, input_shape=(1,)))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions, activation='linear'))

    policy = EpsGreedyQPolicy(eps=.1)
    sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=50, policy=policy)
    sarsa.compile(Adam(lr=1e-3))

    sarsa.fit(env, nb_steps=20000, visualize=False, verbose=0)
    policy.eps = 0.
    h = sarsa.test(env, nb_episodes=20, visualize=False)
    assert_allclose(np.mean(h.history['episode_reward']), 3.)
def test_duel_dqn():
    env = TwoRoundDeterministicRewardEnv()
    np.random.seed(123)
    env.seed(123)
    random.seed(123)
    nb_actions = env.action_space.n

    # Next, we build a very simple model.
    model = Sequential()
    model.add(Dense(16, input_shape=(1,)))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions, activation='linear'))

    memory = SequentialMemory(limit=1000, window_length=1)
    policy = EpsGreedyQPolicy(eps=.1)
    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=50,
                   target_model_update=1e-1, policy=policy, enable_double_dqn=False, enable_dueling_network=True)
    dqn.compile(Adam(lr=1e-3))

    dqn.fit(env, nb_steps=2000, visualize=False, verbose=0)
    policy.eps = 0.
    h = dqn.test(env, nb_episodes=20, visualize=False)
    assert_allclose(np.mean(h.history['episode_reward']), 3.)
Example #3
def get_dqn_agent(side=1.0):
    '''
    Prepare a fresh agent.
    '''

    processor = DummyProcessor()
    input_shape = (3, 3)

    input_x = Input(shape=(1, ) + input_shape)

    # When instantiating the agent network, multiply the board
    # by -1 or +1 depending on which side the agent is playing.
    # This lets the agent otherwise be agnostic to its side.
    input_x_sidenorm = Lambda(lambda x: x * side)(input_x)

    input_x_flat = Flatten()(input_x_sidenorm)
    x = Dense(200)(input_x_flat)
    x = Activation('relu')(x)
    x = Dense(40)(x)
    x = Activation('relu')(x)
    x = keras.layers.concatenate([x, input_x_flat, input_x_flat])  # highway
    x = Dense(env.action_space.n)(x)
    predictions = Activation('linear')(x)
    model = keras.models.Model(inputs=input_x, outputs=predictions)

    print(model.summary())

    # see https://github.com/keras-rl/keras-rl/blob/master/examples/duel_dqn_cartpole.py
    memory = SequentialMemory(limit=50000, window_length=1)
    policy = EpsGreedyQPolicy(0.005)
    dqn = DQNAgent(model=model,
                   nb_actions=env.action_space.n,
                   memory=memory,
                   nb_steps_warmup=1000,
                   target_model_update=1000,
                   policy=policy,
                   enable_double_dqn=True)
    dqn.compile(Adam(lr=1e-4), metrics=['mae'])
    return dqn
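A hedged usage sketch (not from the original source): it assumes the module-level env referenced inside get_dqn_agent is already constructed, and simply builds one agent per side before training the first one.

# Illustrative only: side=+1.0 and side=-1.0 give the two players, so the
# same network architecture can be reused for either side of the board.
agent_plus = get_dqn_agent(side=1.0)
agent_minus = get_dqn_agent(side=-1.0)
agent_plus.fit(env, nb_steps=10000, visualize=False, verbose=1)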
Example #4
def build_dqn(hps, input_dim):
    """Create a DQN agent to be used on lund inputs."""

    print('[+] Constructing DQN agent, model setup:')
    pprint.pprint(hps)

    # set up the DQN agent
    model = build_model(hps, input_dim)
    memory = SequentialMemory(limit=500000, window_length=1)
    if hps["policy"] == "boltzmann":
        policy = BoltzmannQPolicy()
    elif hps["policy"] == "epsgreedyq":
        policy = EpsGreedyQPolicy()
    else:
        raise ValueError("Invalid policy: %s" % hps["policy"])
    duelnet = hps["enable_dueling_network"]
    doubdqn = hps["enable_double_dqn"]
    agent = DQNAgentGroom(model=model,
                          nb_actions=2,
                          enable_dueling_network=duelnet,
                          enable_double_dqn=doubdqn,
                          memory=memory,
                          nb_steps_warmup=500,
                          target_model_update=1e-2,
                          policy=policy)

    if hps['optimizer'] == 'Adam':
        opt = Adam(lr=hps['learning_rate'])
    elif hps['optimizer'] == 'SGD':
        opt = SGD(lr=hps['learning_rate'])
    elif hps['optimizer'] == 'RMSprop':
        opt = RMSprop(lr=hps['learning_rate'])
    elif hps['optimizer'] == 'Adagrad':
        opt = Adagrad(lr=hps['learning_rate'])
    else:
        raise ValueError("Invalid optimizer: %s" % hps['optimizer'])

    agent.compile(opt, metrics=['mae'])

    return agent
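A hedged usage sketch (illustrative values, not from the original source): the keys below are exactly the ones build_dqn reads; build_model and input_dim come from elsewhere in that project and may use additional keys not shown here.

# Illustrative hyperparameter dictionary covering every key build_dqn accesses.
hps = {
    "policy": "epsgreedyq",           # or "boltzmann"
    "enable_dueling_network": True,
    "enable_double_dqn": True,
    "optimizer": "Adam",              # one of Adam, SGD, RMSprop, Adagrad
    "learning_rate": 1e-4,
}
agent = build_dqn(hps, input_dim=10)  # input_dim is whatever build_model expects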
Example #5
def main():
    ENV_NAME = 'BreakoutDeterministic-v4'
    INPUT_SHAPE = (84, 84)
    WINDOW_LENGTH = 4
    # Get the environment and extract the number of actions.
    env = gym.make(ENV_NAME)
    np.random.seed(42)
    env.seed(42)
    num_actions = env.action_space.n

    model = build_model(INPUT_SHAPE, num_actions)
    memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)
    processor = AtariProcessor()
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                                  attr='eps',
                                  value_max=1.,
                                  value_min=.1,
                                  value_test=.05,
                                  nb_steps=1000000)

    dqn = DQNAgent(model=model,
                   nb_actions=num_actions,
                   policy=policy,
                   memory=memory,
                   processor=processor,
                   nb_steps_warmup=50000,
                   gamma=.99,
                   target_model_update=10000,
                   train_interval=4,
                   delta_clip=1.)
    dqn.compile(Adam(lr=.00025), metrics=['mae'])
    callbacks = build_callbacks(ENV_NAME)

    # Load previously trained weights.
    dqn.load_weights('dqn_BreakoutDeterministic-v4_weights_1750000.h5f')

    # Finally, evaluate our algorithm for 10 episodes.
    dqn.test(env, nb_episodes=10, visualize=True)
Example #6
    def __init__(self, observation_shape, nb_actions, eps_steps):
        # First, we build a very simple NN model.

        model = Sequential()
        model.add(Flatten(input_shape=(1, ) + observation_shape))
        model.add(Dense(16))
        model.add(Activation("relu"))
        model.add(Dense(16))
        model.add(Activation("relu"))
        model.add(Dense(16))
        model.add(Activation("relu"))
        model.add(Dense(nb_actions))
        model.add(Activation("linear"))
        print(model.summary())

        # Next, we configure and compile our agent. You can use every
        # built-in Keras optimizer and even the metrics!
        memory = SequentialMemory(limit=50000, window_length=1)

        # policy = BoltzmannQPolicy()
        policy = LinearAnnealedPolicy(
            EpsGreedyQPolicy(),
            attr="eps",
            value_max=1.0,
            value_min=0.1,
            value_test=0.05,
            nb_steps=eps_steps,
        )
        self.dqn = DQNAgent(
            model=model,
            nb_actions=nb_actions,
            memory=memory,
            nb_steps_warmup=1000,
            target_model_update=1e-2,
            policy=policy,
        )

        self.dqn.compile(Adam(lr=1e-3), metrics=["mae"])
Example #7
    def initiate_agent(self, env):
        """initiate a deep Q agent"""
        tf.compat.v1.disable_eager_execution()

        self.env = env

        nb_actions = self.env.action_space.n

        self.model = Sequential()
        self.model.add(
            Dense(512, activation='relu', input_shape=env.observation_space))
        self.model.add(Dropout(0.2))
        self.model.add(Dense(512, activation='relu'))
        self.model.add(Dropout(0.2))
        self.model.add(Dense(512, activation='relu'))
        self.model.add(Dropout(0.2))
        self.model.add(Dense(nb_actions, activation='linear'))

        # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
        # even the metrics!
        memory = SequentialMemory(limit=memory_limit,
                                  window_length=window_length)
        policy = EpsGreedyQPolicy()

        nb_actions = env.action_space.n

        self.dqn = DQNAgent(model=self.model,
                            nb_actions=nb_actions,
                            memory=memory,
                            nb_steps_warmup=nb_steps_warmup,
                            target_model_update=1e-2,
                            policy=policy,
                            processor=CustomProcessor(),
                            batch_size=batch_size,
                            train_interval=train_interval,
                            enable_double_dqn=enable_double_dqn,
                            enable_dueling_network=enable_dueling_network)
        self.dqn.compile(Adam(lr=1e-3), metrics=['mae'])
Example #8
def bootstrapped_train(env):
    model = default_model(env)
    # model = lstm_model(env)

    policy = EpsGreedyQPolicy(eps=0.1)
    memory = SequentialMemory(limit=100000, window_length=1)
    dqn = DQNAgent(
        model=model,
        nb_actions=env.action_space.n,
        memory=memory,
        nb_steps_warmup=10,
        target_model_update=1e-2,
        policy=policy,
    )
    dqn.compile(Adam(lr=1e-3), metrics=["mae"])

    print(model.summary())
    dqn.fit(env, nb_steps=5000, visualize=False, verbose=1)

    env.reset()
    dqn.test(env, nb_episodes=5, visualize=True)

    env.close()
Example #9
def dqn_learn(env,
              n_episodes,
              alpha,
              verbose=False,
              render=False,
              save_model=False,
              **kwargs):
    n_actions = env.action_space.n

    # defines NN architecture
    model = Sequential()
    model.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
    model.add(Dense(500))
    model.add(Activation('relu'))
    model.add(Dense(200))
    model.add(Activation('relu'))
    model.add(Dense(100))
    model.add(Activation('relu'))
    model.add(Dense(n_actions))
    model.add(Activation('linear'))

    memory = SequentialMemory(limit=1000000, window_length=1)

    policy = EpsGreedyQPolicy()

    dqn = DQNAgent(model=model,
                   nb_actions=n_actions,
                   policy=policy,
                   memory=memory,
                   nb_steps_warmup=30000,
                   target_model_update=1e-2)

    dqn.compile(Adam(lr=alpha), metrics=['mae'])

    dqn.fit(env, nb_steps=n_episodes, log_interval=1e5)

    dqn.save_weights(SAVED_MODEL_PATH, overwrite=save_model)
Example #10
def make_deep_q_network(env, args):
    model = KerasModelBuilder(input_shape=args["input_shape"],
                              input_window_length=args["input_window_length"],
                              action_number=env.action_space.n,
                              hidden_layer_size=args["hidden_layer_size"],
                              random_seed=args["random_seed"]).build()

    memory = SequentialMemory(limit=args["replay_memory_size"],
                              window_length=args["input_window_length"])
    processor = AtariProcessor(args["input_shape"])

    policy = CheckpointAnnealedPolicy(EpsGreedyQPolicy(),
                                      attr='eps',
                                      value_max1=args["starting_epslon"],
                                      value_min1=args["annealed_epslon1"],
                                      value_max2=args["annealed_epslon1"],
                                      value_min2=args["annealed_epslon2"],
                                      value_test=args["annealed_epslon2"],
                                      nb_steps1=args["annealed_steps1"],
                                      nb_steps2=args["annealed_steps2"],
                                      starting_step=args["starting_step"])

    dqn = DQNAgent(model=model,
                   nb_actions=env.action_space.n,
                   policy=policy,
                   memory=memory,
                   processor=processor,
                   nb_steps_warmup=args["replay_memory_starting_size"],
                   gamma=args["discount_factor"],
                   target_model_update=args["target_update_frequency"],
                   enable_dueling_network=args["dueling"],
                   enable_double_dqn=args["double_dqn"],
                   train_interval=args["gradient_update_frequency"],
                   delta_clip=1.)

    dqn.compile(Adam(lr=args["learning_rate"]), metrics=['mae'])
    return dqn
Example #11
def dyna_train(cfg, nb_actions, ml_model, model_truncated, sequence_length,
               hstate_size, processor):
    env2 = SynthEnv(ml_model, model_truncated, cfg.env, processor,
                    sequence_length, cfg.WINDOW_LENGTH)

    hidden_in = Input(shape=(1, hstate_size), name='hidden_input')
    hidden_in_f = Flatten(name='flat_hidden')(hidden_in)
    dense_out = Dense(512, activation='relu')(hidden_in_f)
    q_out = Dense(nb_actions, activation='linear')(dense_out)
    model2 = Model(inputs=[hidden_in], outputs=[q_out])
    print(model2.summary())

    memory2 = SequentialMemory(limit=cfg.memory_limit, window_length=1)
    policy2 = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                                   attr='eps',
                                   value_max=1.,
                                   value_min=.1,
                                   value_test=.05,
                                   nb_steps=cfg.nb_steps_annealed_policy)
    dqn2 = DQNAgent(model=model2,
                    nb_actions=nb_actions,
                    policy=policy2,
                    memory=memory2,
                    nb_steps_warmup=cfg.nb_steps_warmup_dqn_agent,
                    gamma=.99,
                    target_model_update=cfg.target_model_update_dqn_agent,
                    train_interval=4,
                    delta_clip=1.)
    dqn2.compile(Adam(lr=.00025), metrics=['mae'])
    '''dyna_weights_filename = 'dyna_dqn_{}_weights.h5f'.format(env_name)
    dyna_checkpoint_weights_filename = 'dyna_dqn_' + env_name + '_weights_{step}.h5f'
    dyna_log_filename = 'dyna_dqn_{}_log.json'.format(env_name)
    dyna_callbacks = [ModelIntervalCheckpoint(dyna_checkpoint_weights_filename, interval=250000)]
    dyna_callbacks += [FileLogger(dyna_log_filename, interval=100)]'''
    dqn2.fit(env2, nb_steps=cfg.nb_steps_dqn_fit,
             log_interval=10000)  # callbacks=dyna_callbacks,
    return dqn2
Example #12
    def __init__(self, stock: str):
        self.env = gym.make('stockenv-v0', df=read_daily_data(stock))

        print(self.env)
        print(self.env.action_space)
        print(self.env.observation_space)

        self.env.seed(123)
        self.stock = stock

        memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)
        policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                                      attr='eps',
                                      value_max=1.,
                                      value_min=.1,
                                      value_test=.05,
                                      nb_steps=1000000)

        processor = StockProcessor(stock)
        model = self.create_model(30)
        print("output:", model.output.shape)
        print("output2:", self.env.action_space.shape)

        print(list(model.output.shape))
        print(list((None, self.env.action_space.shape)))

        self.dqn = DQNAgent(model=model,
                            nb_actions=self.env.action_space.n,
                            policy=policy,
                            memory=memory,
                            processor=processor,
                            nb_steps_warmup=50000,
                            gamma=.99,
                            target_model_update=10000,
                            train_interval=4,
                            delta_clip=1.)
        self.dqn.compile(Adam(lr=.00025), metrics=['mae'])
Example #13
def train():
    env = gym.make('CartPole-v0')

    model = model_gen(env)

    memory = SequentialMemory(limit=50000, window_length=1)  # memory replay

    # epsilon greedy algorithm
    policy = EpsGreedyQPolicy(eps=0.001)

    dqn = DQNAgent(model=model,
                   nb_actions=env.action_space.n,
                   gamma=0.99,
                   memory=memory,
                   nb_steps_warmup=10,
                   target_model_update=1e-2,
                   policy=policy)
    dqn.compile(Adam(lr=1e-3), metrics=['mae'])
    history = dqn.fit(env, nb_steps=50000, visualize=False, verbose=2)

    with open('data/cartpole_history.json', 'w') as f:
        json.dump(history.history, f)

    dqn.save_weights('data/cartpole_dqn.hdf5')
Example #14
    def __init__(self, model, policy=EpsGreedyQPolicy(), enable_double_dqn=True,
                 target_model=None, policy_model=None,
                 nb_max_steps_recurrent_unrolling=100, *args, **kwargs):
        super(RecurrentDQNAgent, self).__init__(*args, **kwargs)

        # Validate (important) input.
        if hasattr(model.output, '__len__') and len(model.output) > 1:
            raise ValueError('Model "{}" has more than one output. DQN expects a model that has a single output.'.format(model))
        if model.output._keras_shape[-1] != self.nb_actions:
            raise ValueError('Model output "{}" has invalid shape. DQN expects a model that has one dimension for each action, in this case {}.'.format(model.output, self.nb_actions))

        # Validate settings for recurrent DQN.
        self.is_recurrent = True
        if self.is_recurrent:
            if enable_double_dqn:
                raise ValueError('DoubleDQN (`enable_double_dqn = True`) is currently not supported for recurrent Q learning.')
            memory = kwargs['memory']
            if not memory.is_episodic:
                raise ValueError('Recurrent Q learning requires an episodic memory. You are trying to use it with memory={} instead.'.format(memory))
            if nb_max_steps_recurrent_unrolling and not model.stateful:
                raise ValueError('Recurrent Q learning with max. unrolling requires a stateful model.')
            if policy_model is None or not policy_model.stateful:
                raise ValueError('Recurrent Q learning requires a separate stateful policy model with batch_size=1. Please refer to an example to see how to properly set it up.')

        # Parameters.
        self.enable_double_dqn = enable_double_dqn
        self.nb_max_steps_recurrent_unrolling = nb_max_steps_recurrent_unrolling

        # Related objects.
        self.model = model
        self.target_model = target_model
        self.policy_model = policy_model if policy_model is not None else model
        self.policy = policy

        # State.
        self.reset_states()
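A minimal sketch of the stateful policy-model requirement validated above (assumptions: flat observations of size obs_dim and a training batch size of 32; this is illustrative, not a helper from the library).

from keras.models import Sequential
from keras.layers import LSTM, Dense

def make_stateful_q_model(batch_size, obs_dim, nb_actions):
    # stateful=True keeps the LSTM hidden state between successive calls,
    # which is what recurrent Q learning needs when acting step by step.
    model = Sequential()
    model.add(LSTM(32, stateful=True,
                   batch_input_shape=(batch_size, 1, obs_dim)))
    model.add(Dense(nb_actions, activation='linear'))
    return model

train_model = make_stateful_q_model(batch_size=32, obs_dim=4, nb_actions=2)
policy_model = make_stateful_q_model(batch_size=1, obs_dim=4, nb_actions=2)  # batch_size=1 for acting
policy_model.set_weights(train_model.get_weights())  # keep the two models in sync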
Example #15
    def __init__(self, model, nb_actions, policy=None, test_policy=None, gamma=.99, nb_steps_warmup=10,
                 train_interval=1, delta_clip=np.inf, *args, **kwargs):
        super(SarsaAgent, self).__init__(*args, **kwargs)

        # Do not use defaults in the constructor because that would mean that each
        # instance shares the same policy object (see the small illustration after this snippet).
        if policy is None:
            policy = EpsGreedyQPolicy()
        if test_policy is None:
            test_policy = GreedyQPolicy()

        self.model = model
        self.nb_actions = nb_actions
        self.policy = policy
        self.test_policy = test_policy
        self.gamma = gamma
        self.nb_steps_warmup = nb_steps_warmup
        self.train_interval = train_interval

        self.delta_clip = delta_clip
        self.compiled = False
        self.actions = None
        self.observations = None
        self.rewards = None
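A small illustration (not from the source) of the shared-default pitfall the comment in the constructor above guards against:

# A stateful default argument is evaluated once, at definition time,
# so every call would hand back the very same policy object.
def make_policy_badly(policy=EpsGreedyQPolicy()):
    return policy

a = make_policy_badly()
b = make_policy_badly()
assert a is b  # both callers share (and would mutate) one policy instance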
Example #16
def main(env_name, nb_steps):
    # Get the environment and extract the number of actions.
    env = gym.make(env_name)
    np.random.seed(123)
    env.seed(123)

    nb_actions = env.action_space.n
    input_shape = (1,) + env.observation_space.shape
    model = create_nn_model(input_shape, nb_actions)

    # Finally, we configure and compile our agent.
    memory = EpisodeParameterMemory(limit=10000, window_length=1)

    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1.,
                                  value_min=.1, value_test=.05,
                                  nb_steps=1000000)
    agent = DQNAgent(model=model, nb_actions=nb_actions, policy=policy,
                     memory=memory, nb_steps_warmup=1000000,
                     gamma=.99, target_model_update=1000,
                     train_interval=4, delta_clip=1.)
    agent.compile(Adam(lr=.00025), metrics=['mae'])
    agent.fit(env, nb_steps=nb_steps, visualize=False, verbose=1)

    # After training is done, we save the final weights.
    agent.save_weights(f'dqn_{env_name}_params.h5f', overwrite=True)

    # Finally, evaluate the agent
    history = agent.test(env, nb_episodes=100, visualize=False)
    rewards = np.array(history.history['episode_reward'])
    print(("Test rewards (#episodes={}): mean={:>5.2f}, std={:>5.2f}, "
           "min={:>5.2f}, max={:>5.2f}")
          .format(len(rewards),
                  rewards.mean(),
                  rewards.std(),
                  rewards.min(),
                  rewards.max()))
Example #17
def main():
    ENV_NAME = 'LunarLander-v2'
    # Get the environment and extract the number of actions.
    env = gym.make(ENV_NAME)
    np.random.seed(42)
    env.seed(42)
    num_actions = env.action_space.n
    state_space = env.observation_space.shape[0]
    print(num_actions)

    model = build_model(state_space, num_actions)

    memory = SequentialMemory(limit=50000, window_length=1)

    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                                  attr='eps',
                                  value_max=1.,
                                  value_min=.1,
                                  value_test=.05,
                                  nb_steps=10000)

    dqn = DQNAgent(model=model,
                   nb_actions=num_actions,
                   memory=memory,
                   nb_steps_warmup=10,
                   target_model_update=1e-2,
                   policy=policy)
    dqn.compile(Adam(lr=1e-3), metrics=['mae'])

    callbacks = build_callbacks(ENV_NAME)

    # Load previously trained weights.
    dqn.load_weights('dqn_LunarLander-v2_weights_510000.h5f')

    # Finally, evaluate our algorithm for 10 episodes.
    dqn.test(env, nb_episodes=10, visualize=True)
Example #18
def main():

    # Get the environment and extract the number of actions.
    environment_name = "FlappyBird-v0"
    environment = gym.make(environment_name)
    np.random.seed(666)
    nb_actions = environment.action_space.n

    # Build the model.
    model = build_model((WINDOW_LENGTH, ) + INPUT_SHAPE, nb_actions)
    print(model.summary())

    # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
    # even the metrics!
    memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)
    processor = FlappyBirdProcessor()

    # Select a policy. We use eps-greedy action selection, which means that a random action is selected
    # with probability eps. We anneal eps from 1.0 to 0.1 over the course of 1M steps. This is done so that
    # the agent initially explores the environment (high eps) and then gradually sticks to what it knows
    # (low eps). We also set a dedicated eps value that is used during testing. Note that we set it to 0.05
    # so that the agent still performs some random actions. This ensures that the agent cannot get stuck.
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                                  attr='eps',
                                  value_max=1.,
                                  value_min=.1,
                                  value_test=.05,
                                  nb_steps=1000000)

    # The trade-off between exploration and exploitation is difficult and an on-going research topic.
    # If you want, you can experiment with the parameters or use a different policy. Another popular one
    # is Boltzmann-style exploration:
    # policy = BoltzmannQPolicy(tau=1.)
    # Feel free to give it a try!

    dqn = DQNAgent(model=model,
                   nb_actions=nb_actions,
                   policy=policy,
                   memory=memory,
                   processor=processor,
                   nb_steps_warmup=50000,
                   gamma=.99,
                   target_model_update=10000,
                   train_interval=4,
                   delta_clip=1.)
    dqn.compile(optimizers.Adam(lr=.00025), metrics=['mae'])

    weights_filename = 'dqn_{}_weights.h5f'.format(environment_name)

    # Okay, now it's time to learn something! We capture the interrupt exception so that training
    # can be prematurely aborted. Notice that now you can use the built-in Keras callbacks!
    checkpoint_weights_filename = 'dqn_' + environment_name + '_weights_{step}.h5f'
    log_filename = 'dqn_{}_log.json'.format(environment_name)
    callbacks = [
        ModelIntervalCheckpoint(checkpoint_weights_filename, interval=250000)
    ]
    callbacks += [TensorboardCallback()]
    callbacks += [FileLogger(log_filename, interval=100)]
    dqn.fit(environment,
            callbacks=callbacks,
            nb_steps=1750000,
            log_interval=10000)

    # After training is done, we save the final weights one more time.
    dqn.save_weights(weights_filename, overwrite=True)

    # Finally, evaluate our algorithm for 10 episodes.
    dqn.test(environment, nb_episodes=10, visualize=False)
Example #19
def main(model_name, options):

    # Initialize maze environments.
    env = gym.make('Pong-v0')
    #env = gym.make('Taxi-v2')

    envs = [env]

    # Setting hyperparameters.
    nb_actions = env.action_space.n
    maze_dim = (6400, 1)
    h_size = 64  # For DQN
    e_t_size = 64  #For MQN / RMQN
    context_size = 64
    nb_steps_warmup = int(1e5)
    nb_steps = int(4e5)
    buffer_size = 8e4
    learning_rate = 0.003
    target_model_update = 0.999
    clipnorm = 10.
    switch_rate = 50
    window_length = 12
    memory_size = None

    # Callbacks
    log = TrainEpisodeLogger()
    #tensorboard = TensorBoard(log_dir="./logs/{}".format(model_name))
    rl_tensorboard = RLTensorBoard(log_dir="./logs/{}".format(model_name),
                                   histogram_freq=100)

    callbacks = [log, rl_tensorboard]

    ### Models ###
    model = None
    target_model = None

    # MQN model.
    if "MQN" in options:
        memory_size = 12
        model = MQNmodel(e_t_size, context_size, memory_size, window_length,
                         nb_actions, maze_dim)
        target_model = MQNmodel(e_t_size, context_size, memory_size,
                                window_length, nb_actions, maze_dim)

    # RMQN model.
    if "RMQN" in options:
        memory_size = 12
        model = RMQNmodel(e_t_size, context_size, memory_size, window_length,
                          nb_actions, maze_dim)
        target_model = RMQNmodel(e_t_size, context_size, memory_size,
                                 window_length, nb_actions, maze_dim)

    # Distributional MQN model.
    nb_atoms = 51
    v_min = -2.
    v_max = 2.
    #model = DistributionalMQNModel(e_t_size, context_size, window_length, nb_actions, nb_atoms, obs_dimensions)
    #target_model = DistributionalMQNModel(e_t_size, context_size, window_length, nb_actions, nb_atoms, obs_dimensions)

    # DQN model
    if "DQN" in options:
        model = DQNmodel(nb_actions, window_length, h_size, maze_dim)
        target_model = DQNmodel(nb_actions, window_length, h_size, maze_dim)

    # Initialize our target model with the same weights as our model.
    target_model.set_weights(model.get_weights())

    # Initialize memory buffer for DQN algorithm.
    experience = [
        SequentialMemory(limit=int(buffer_size / len(envs)),
                         window_length=window_length) for i in range(len(envs))
    ]

    # Learning policy where we initially begin training our agent by making random moves
    # with a probability of 1, and linearly decrease that probability down to 0.1 over the
    # course of some arbitrary number of steps. (nb_steps)
    policy = LinearAnnealedPolicy(inner_policy=EpsGreedyQPolicy(),
                                  attr="eps",
                                  value_max=1.0,
                                  value_min=0.1,
                                  value_test=0.,
                                  nb_steps=1e5)

    # Optional processor.
    processor = PongProcessor()
    # processor = MazeProcessor()

    # Initialize and compile the DQN agent.

    dqn = DQNAgent(model=model,
                   target_model=target_model,
                   nb_actions=nb_actions,
                   memory=experience,
                   nb_steps_warmup=nb_steps_warmup,
                   target_model_update=target_model_update,
                   policy=policy,
                   processor=processor,
                   batch_size=8)

    #Initialize experimental Distributional DQN Agent
    '''
    dqn = DistributionalDQNAgent(
        model=model,
        target_model=target_model,
        num_atoms=nb_atoms,
        v_min=v_min,
        v_max=v_max,
        nb_actions=nb_actions,
        memory=experience,
        nb_steps_warmup=nb_steps_warmup,
        target_model_update=target_model_update,
        policy=policy,
        #processor=processor,
        batch_size=32
    )
    '''

    # Compile the agent to check for validity, build tensorflow graph, etc.
    dqn.compile(RMSprop(lr=learning_rate, clipnorm=clipnorm), metrics=["mae"])

    # Weights will be loaded if weight file exists.
    if os.path.exists("data/{}/{}".format(model_name, model_name + ".h5")):
        dqn.load_weights("data/{}/{}".format(model_name, model_name + ".h5"))

    # Train DQN in environment.
    if "train" in options:
        dqn.fit(env, nb_steps=nb_steps, verbose=0, callbacks=callbacks)

        # Visualization / Logging Tools
        logmetrics(log, model_name)
        logHyperparameters(model_name,
                           e_t_size=e_t_size,
                           context_size=context_size,
                           h_size=h_size,
                           memory_size=memory_size,
                           learning_rate=learning_rate,
                           target_model_update=target_model_update,
                           clipnorm=clipnorm,
                           window_length=window_length,
                           nb_atoms=nb_atoms,
                           v_min=v_min,
                           v_max=v_max)

        # Save weights.
        dqn.save_weights("data/{}/{}".format(model_name, model_name + ".h5"))

    # Test DQN in environment.
    if "test" in options:
        dqn.test(env, nb_episodes=100, visualize=True)

    #Debugging
    if "debug" in options:
        observation = env.reset()
        outputLayer(dqn.model, np.array(experience[0].sample(32)[0].state0))
        #visualizeLayer(dqn.model, dqn.layers[1], observation)

    return
Example #20
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)
processor = AtariProcessor()

# Select a policy. We use eps-greedy action selection, which means that a random action is selected
# with probability eps. We anneal eps from 1.0 to 0.1 over the course of 1M steps. This is done so that
# the agent initially explores the environment (high eps) and then gradually sticks to what it knows
# (low eps). We also set a dedicated eps value that is used during testing. Note that we set it to 0.05
# so that the agent still performs some random actions. This ensures that the agent cannot get stuck.
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1, value_test=.05,
                              nb_steps=1000000)

# The trade-off between exploration and exploitation is difficult and an on-going research topic.
# If you want, you can experiment with the parameters or use a different policy. Another popular one
# is Boltzmann-style exploration:
# policy = BoltzmannQPolicy(tau=1.)
# Feel free to give it a try!

dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory,
               processor=processor, nb_steps_warmup=50000, gamma=.99, target_model_update=10000,
               train_interval=4, delta_clip=1.)
dqn.compile(Adam(lr=.00025), metrics=['mae'])

if args.mode == 'train':
    # Okay, now it's time to learn something! We capture the interrupt exception so that training
Example #21
model.add(Dense(7))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

# log all metrics to TensorBoard
tb_callback = TensorBoard(log_dir='./logs/s4a11r5/run3')
test_callback = TestCallback(tb_callback)
callbacks = [
    tb_callback,
    test_callback
]

memory = SequentialMemory(limit=50000, window_length=1)
policy = EpsGreedyQPolicy(eps=0.2)
# also tried LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.01, value_test=.05, nb_steps=100000)
dqn = OriginalDQNAgent(model=model, nb_actions=nb_actions, memory=memory, policy=policy, target_model_update=1e-2, gamma=0.999)
# other params tried: target_model_update = 10000 (steps), gamma = 0.9
dqn.compile(Adam(lr=.008), metrics=['mae'])
# also tried lr = 0.01, 0.001, 0.0001
dqn.fit(env, nb_steps=840000, visualize=True, verbose=2, callbacks=callbacks) # main function of training

"""
This is the main process of dqn.fit function:
while step < nb_steps:
    call env.reset() to get initial state
    compute action in dqn.forward()
    call env.step() to execute action and get tuple(s',a,r,done,info)
    call dqn.backward() to train NN and get the metrics
"""
Example #22
# otherwise create a new memory
except:
    memory = SequentialMemory(limit=MEMORY_SIZE, window_length=STATES)

# Try to load the agent
try:
    agent_ame = fnmatch.filter(os.listdir(daten_pfad), '*_agent.pkl')[-1]
    nb_training_steps = int(agent_ame.split("_")[0])
    NUM_STEPS = NUM_STEPS - nb_training_steps  # number of remaining steps
    agent_filepath = daten_pfad + '/' + agent_ame
    dqn = pickle.load(open(agent_filepath, "rb"))

# otherwise create a new agent
except:
    TRAIN_POLICY = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                                        attr='eps',
                                        value_max=0.05,
                                        value_min=0.05,
                                        value_test=0.01,
                                        nb_steps=NUM_STEPS_ANNEALED)
    TEST_POLICY = EpsGreedyQPolicy(eps=.01)
    dqn = DQNAgent(model=model,
                   nb_actions=NUM_ACTIONS_PRO_AGENT,
                   test_policy=TEST_POLICY,
                   policy=TRAIN_POLICY,
                   memory=memory,
                   processor=processor,
                   nb_steps_warmup=NUM_STEPS_WARMUP,
                   gamma=.99,
                   target_model_update=TARGET_MODEL_UPDATE,
                           data_format="channels_last")(layer0)
    layer2 = layers.Conv2D(64, 4, strides=2, activation="relu",
                           data_format="channels_last")(layer1)
    layer3 = layers.Conv2D(64, 3, strides=1, activation="relu",
                           data_format="channels_last")(layer2)
    layer4 = layers.Flatten()(layer3)
    layer5 = layers.Dense(512, activation="relu")(layer4)
    action = layers.Dense(actions, activation="linear")(layer5)

    return K.Model(inputs=inputs, outputs=action)


if __name__ == '__main__':
    env = gym.make('BreakoutNoFrameskip-v4')
    state = env.reset()
    actions = env.action_space.n
    model = create_q_model(actions)
    memory = SequentialMemory(limit=1000000, window_length=4)
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps',
                                  value_max=1., value_min=.1,
                                  value_test=.05, nb_steps=850000)
    process = AtariProcessor()
    dqn = DQNAgent(model=model, nb_actions=actions, memory=memory,
                   nb_steps_warmup=50000, target_model_update=10000,
                   policy=policy, processor=process, train_interval=4,
                   gamma=.99, delta_clip=1.)
    dqn.compile(optimizer=Adam(lr=0.00025), metrics=['mae', 'accuracy'])
    callback = [ModelIntervalCheck('policy.h5', 1000, 1, model)]
    dqn.fit(env, nb_steps=1750000, callbacks=callback, visualize=True)
    model.save("policy.h5")
Example #24
    def train(
        self,
        env,
        input_fn,
        max_steps=10000,
        policy=EpsGreedyQPolicy(),
        memory=SequentialMemory(limit=1000, window_length=1),
        target_model_update=10000,
        gamma=0.99,
        warmup_steps=None,
        batch_size=64,
        summary_steps=100,
        save_steps=10000,
        visualize=False,
        seed=None,
    ):

        min_memory = max(
            warmup_steps,
            batch_size) if warmup_steps is not None else batch_size

        with tf.Graph().as_default() as graph:

            #####################
            # config
            #####################

            if seed is not None:
                tf.set_random_seed(seed)

            global_step = tf.train.get_or_create_global_step()

            #####################
            # inputs
            #####################

            inputs = input_fn()
            state0_t, reward_t, terminal_t, action_t, state1_t = [
                inputs[x]
                for x in ["state0", "reward", "terminal", "action", "state1"]
            ]

            print(inputs)

            #####################
            # model_fn
            #####################

            with tf.variable_scope("Model") as model_scope:
                model_inputs = dict(state=inputs["state0"])
                model_q_values = self.model_fn(model_inputs,
                                               tf.estimator.ModeKeys.TRAIN,
                                               self.params)

            with tf.variable_scope("Model", reuse=True) as predict_scope:
                model_inputs = dict(state=inputs["state0"])
                predict_q_values = self.model_fn(model_inputs,
                                                 tf.estimator.ModeKeys.PREDICT,
                                                 self.params)

            with tf.variable_scope("TargetModel") as target_scope:
                target_model_inputs = dict(state=inputs["state1"])
                target_q_values = self.model_fn(target_model_inputs,
                                                tf.estimator.ModeKeys.PREDICT,
                                                self.params)

            # get variables
            model_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                scope=model_scope.name)
            target_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                 scope=target_scope.name)

            not_terminal = 1.0 - tf.cast(inputs["terminal"], tf.float32)
            action_values = tf.reduce_max(target_q_values, axis=1)
            target_values = inputs[
                "reward"] + gamma * action_values * not_terminal

            assert action_values.get_shape().as_list(
            ) == inputs["reward"].get_shape().as_list()
            assert action_values.get_shape().as_list(
            ) == not_terminal.get_shape().as_list()

            model_action_values = utils.select_columns(model_q_values,
                                                       inputs["action"])

            tf.losses.huber_loss(target_values,
                                 model_action_values,
                                 delta=100.0)
            # loss = tf.reduce_mean(loss)

            loss = tf.losses.get_total_loss()

            optimizer = tf.train.AdamOptimizer(
                learning_rate=self.params.learning_rate)

            with tf.control_dependencies(
                    tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
                train_op = optimizer.minimize(
                    loss,
                    global_step=tf.train.get_global_step(),
                    var_list=tf.get_collection(
                        tf.GraphKeys.TRAINABLE_VARIABLES,
                        scope=model_scope.name),
                )

            tf.summary.scalar("target", tf.reduce_mean(target_values))

            #####################
            # train stuff
            #####################

            tf.summary.scalar("loss", loss)

            train_summaries = tf.summary.merge_all()

            if target_model_update >= 1:
                update_target_op = tf.cond(
                    # global_step % target_model_update == 0
                    tf.equal(
                        tf.mod(tf.train.get_global_step(),
                               target_model_update),
                        0,
                    ),
                    lambda: update_target_weights_hard(target_variables,
                                                       model_variables),
                    lambda: tf.no_op(),
                )
            else:
                update_target_op = update_target_weights_soft(
                    target_variables, model_variables, target_model_update)

            final_train_op = tf.group(
                train_op,
                update_target_op,
            )

            #####################
            # episode stuff
            #####################

            episode_length_t = tf.placeholder(tf.int32, name="episode_length")
            episode_reward_t = tf.placeholder(tf.float32, name="episode_reward")

            episode_length_summary = tf.summary.scalar("episode_length",
                                                       episode_length_t)
            episode_reward_summary = tf.summary.scalar("episode_reward",
                                                       episode_reward_t)

            final_episode_op = tf.group()
            episode_summaries = tf.summary.merge(
                [episode_length_summary, episode_reward_summary])

            #####################
            # initializers
            #####################

            global_variables_initializer = tf.global_variables_initializer()

            saver = tf.train.Saver()

        writer = tf.summary.FileWriter(self.model_dir)

        with graph.as_default(), tf.Session(graph=graph) as sess:

            utils.initialize_or_restore(sess, self.model_dir,
                                        global_variables_initializer)
            graph.finalize()

            current_step = sess.run(global_step)

            state0 = env.reset()

            _episode_length = 0
            _episode_reward = 0.0

            for step in range(current_step, max_steps):

                step_feed = {state0_t: [state0]}

                predictions = sess.run(predict_q_values, step_feed)

                action = policy.select_action(q_values=predictions[0])

                state1, reward, terminal, _info = env.step(action)

                if visualize:
                    env.render()

                #
                _episode_length += 1
                _episode_reward += reward
                #

                memory.append(state0, action, reward, terminal)

                train_fetches = {}
                train_feed = {}

                if memory.nb_entries > min_memory:
                    experiences = memory.sample(batch_size)
                    experiences = [list(x) for x in zip(*experiences)]

                    state0_a, action_a, reward_a, state1_a, terminal_a = experiences

                    state0_a = np.squeeze(state0_a)
                    state1_a = np.squeeze(state1_a)

                    train_feed.update({
                        state0_t: state0_a,
                        action_t: action_a,
                        reward_t: reward_a,
                        state1_t: state1_a,
                        terminal_t: terminal_a,
                    })

                    train_fetches["train_op"] = final_train_op

                    if step % summary_steps == 0:
                        train_fetches["train_summaries"] = train_summaries

                    if step % save_steps == 0:
                        checkpoint_path = os.path.join(self.model_dir,
                                                       "model.ckpt")
                        saver.save(sess, checkpoint_path, global_step=step)

                if terminal:
                    train_feed[episode_length_t] = _episode_length
                    train_feed[episode_reward_t] = _episode_reward

                    train_fetches["episode_op"] = final_episode_op
                    train_fetches["episode_summaries"] = episode_summaries

                # do training
                results = sess.run(train_fetches, train_feed)

                if "train_summaries" in results:
                    writer.add_summary(
                        results["train_summaries"],
                        step,
                    )

                if "episode_summaries" in results:
                    writer.add_summary(
                        results["episode_summaries"],
                        step,
                    )

                # end step
                if terminal:
                    state0 = env.reset()
                    #
                    _episode_length = 0
                    _episode_reward = 0.0
                    #

                else:
                    state0 = state1
Example #25
        # Pass a Class with extra parameters
        reference_generator=WienerProcessReferenceGenerator(
            reference_state='i', sigma_range=(5e-3, 5e-1)))
    nb_actions = env.action_space.n
    env = FlattenObservation(env)
    model = Sequential()
    model.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
    model.add(Dense(4))
    model.add(LeakyReLU(alpha=0.05))
    model.add(Dense(4))
    model.add(LeakyReLU(alpha=0.05))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))

    memory = SequentialMemory(limit=15000, window_length=1)
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(eps=0.5), 'eps', 0.5, 0.01,
                                  0, 20000)
    dqn = DQNAgent(model=model,
                   policy=policy,
                   nb_actions=nb_actions,
                   memory=memory,
                   gamma=0.5,
                   batch_size=128,
                   train_interval=1,
                   memory_interval=1)

    dqn.compile(Adam(), metrics=['mse'])
    dqn.fit(env,
            nb_steps=200000,
            action_repetition=5,
            verbose=1,
Example #26
    def __init__(self,
                 model,
                 policy=None,
                 test_policy=None,
                 enable_double_dqn=False,
                 enable_dueling_network=False,
                 dueling_type='avg',
                 *args,
                 **kwargs):
        super(DQNAgent, self).__init__(*args, **kwargs)

        # Validate (important) input.
        if hasattr(model.output, '__len__') and len(model.output) > 1:
            raise ValueError(
                'Model "{}" has more than one output. DQN expects a model that has a single output.'
                .format(model))
        if model.output._keras_shape != (None, self.nb_actions):
            raise ValueError(
                'Model output "{}" has invalid shape. DQN expects a model that has one dimension for each action, in this case {}.'
                .format(model.output, self.nb_actions))

        # Parameters.
        self.enable_double_dqn = enable_double_dqn
        self.enable_dueling_network = enable_dueling_network
        self.dueling_type = dueling_type
        if self.enable_dueling_network:
            # get the second-to-last layer of the model and discard the final layer
            layer = model.layers[-2]
            nb_action = model.output._keras_shape[-1]
            # layer y has a shape (nb_action+1,)
            # y[:,0] represents V(s;theta)
            # y[:,1:] represents A(s,a;theta)
            y = Dense(nb_action + 1, activation='linear')(layer.output)
            # calculate Q(s,a;theta); a small numeric check of the three variants follows this snippet
            # dueling_type == 'avg'
            # Q(s,a;theta) = V(s;theta) + (A(s,a;theta)-Avg_a(A(s,a;theta)))
            # dueling_type == 'max'
            # Q(s,a;theta) = V(s;theta) + (A(s,a;theta)-max_a(A(s,a;theta)))
            # dueling_type == 'naive'
            # Q(s,a;theta) = V(s;theta) + A(s,a;theta)
            if self.dueling_type == 'avg':
                outputlayer = Lambda(
                    lambda a: K.expand_dims(a[:, 0], -1) + a[:, 1:] - K.mean(
                        a[:, 1:], axis=1, keepdims=True),
                    output_shape=(nb_action, ))(y)
            elif self.dueling_type == 'max':
                outputlayer = Lambda(
                    lambda a: K.expand_dims(a[:, 0], -1) + a[:, 1:] - K.max(
                        a[:, 1:], axis=1, keepdims=True),
                    output_shape=(nb_action, ))(y)
            elif self.dueling_type == 'naive':
                outputlayer = Lambda(
                    lambda a: K.expand_dims(a[:, 0], -1) + a[:, 1:],
                    output_shape=(nb_action, ))(y)
            else:
                assert False, "dueling_type must be one of {'avg','max','naive'}"

            model = Model(inputs=model.input, outputs=outputlayer)

        # Related objects.
        self.model = model
        if policy is None:
            policy = EpsGreedyQPolicy()
        if test_policy is None:
            test_policy = GreedyQPolicy()
        self.policy = policy
        self.test_policy = test_policy

        # State.
        self.reset_states()
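A small numeric check (illustrative, not from the source) of the three dueling aggregations described in the comments above, with V(s)=1.0 and advantages A=[2.0, 4.0]:

import numpy as np

V = 1.0
A = np.array([2.0, 4.0])
Q_avg = V + (A - A.mean())    # dueling_type == 'avg'   -> [0., 2.]
Q_max = V + (A - A.max())     # dueling_type == 'max'   -> [-1., 1.]
Q_naive = V + A               # dueling_type == 'naive' -> [3., 5.]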
Example #27
def run():
    """Construct and start the environment."""

    env = JacoEnv(64,
                  64,
                  100,
                  0.1,
                  0.8,
                  True)
    nb_actions = env.real_num_actions  # all possible actions; each action is a unit in this vector
    new_floor_color = list((0.55 - 0.45) * np.random.random(3) + 0.45) + [1.]
    new_cube_color = list(np.random.random(3)) + [1.]
    env.change_floor_color(new_floor_color)
    env.change_cube_color(new_cube_color)

    encoder = load_model(WEIGHTS_FILE)
    print("#########################")
    nb_observation_space = (64, 64, 3)
    original_input = Input(shape=(WINDOW_LENGTH,) + nb_observation_space)
    in_layer = [Lambda(lambda x: x[:, i, :, :])(original_input) for i in range(WINDOW_LENGTH)]
    for layer in encoder.layers:
        layer.trainable = False
    print(encoder.summary())
    encoder_output = [encoder(x) for x in in_layer]

    x = Concatenate()(encoder_output)
    x = Dense(512, activation='relu')(x)
    x = Dense(512, activation='relu')(x)
    x = Dense(nb_actions, activation='linear')(x)
    model = Model(original_input, [x])
    print(model.summary())
    if MULTI_GPU:
        model = multi_gpu_model(model, gpus=2)
        print(model.summary())

    num_warmup = 50000
    # num_simulated_annealing = 500000 + num_warmup
    # num_warmup = 0
    num_simulated_annealing = 220000 + num_warmup

    memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1, value_test=.05, nb_steps=num_simulated_annealing)

    dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory, nb_steps_warmup=num_warmup, gamma=.99, target_model_update=10000, train_interval=4, delta_clip=1.)
    dqn.compile(Adam(lr=.00025), metrics=['mae'])

    if False:
        dqn.load_weights("stylegan_dqn_weights")
        checkpoint_callback = ModelCheckpoint("stylegan_dqn_checkpoint", monitor='episode_reward', verbose=0, save_best_only=True, save_weights_only=True, mode='max', period = 10)
        history = dqn.fit(env, nb_steps=num_simulated_annealing + 450000, visualize=False, verbose=1, callbacks=[checkpoint_callback])
        dqn.save_weights("stylegan_dqn_weights")
        np.savez_compressed("stylegan_dqn_history", episode_reward=np.asarray(history.history['episode_reward']))
    else:
        dqn.load_weights("stylegan_dqn_weights")

        print("original domain")
        source_test_losses = dqn.test(env, nb_episodes=100, visualize=True)
        np.savez_compressed("myvae_dqn_source_test",
                            episode_reward=np.asarray(source_test_losses.history['episode_reward']),
                            nb_steps=np.asarray(source_test_losses.history['nb_steps']))

        print("target domain")
        new_floor_color = [0.4, 0.6, 0.4, 1.]
        new_cube_color = [1.0, 0.0, 0.0, 1.]
        env.change_floor_color(new_floor_color)
        env.change_cube_color(new_cube_color)
        target_test_losses = dqn.test(env, nb_episodes=100, visualize=True)
        np.savez_compressed("myvae_dqn_target_test",
                            episode_reward=np.asarray(target_test_losses.history['episode_reward']),
                            nb_steps=np.asarray(target_test_losses.history['nb_steps']))
Example #28
    model = Sequential()
    model.add(Dense(128, activation="elu", input_shape=(1, 10)))

    # Our embedding has shape (1, 10), which affects our hidden layer
    # dimension and output dimension.
    # Flattening resolves potential issues that would arise otherwise.
    model.add(Flatten())
    model.add(Dense(64, activation="elu"))
    model.add(Dense(n_action, activation="linear"))

    memory = SequentialMemory(limit=10000, window_length=1)

    # Simple epsilon greedy
    policy = LinearAnnealedPolicy(
        EpsGreedyQPolicy(),
        attr="eps",
        value_max=1.0,
        value_min=0.05,
        value_test=0,
        nb_steps=10000,
    )

    # Defining our DQN
    dqn = DQNAgent(
        model=model,
        nb_actions=len(env_player.action_space),
        policy=policy,
        memory=memory,
        nb_steps_warmup=1000,
        gamma=0.5,
Example #29
model = Sequential()
model.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
#print(model.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=300000, window_length=1)
policy = EpsGreedyQPolicy()
dqn = DQNAgent(model=model,
               nb_actions=nb_actions,
               memory=memory,
               nb_steps_warmup=10,
               target_model_update=1e-2,
               policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Load previously trained weights.
dqn.load_weights('dqn_{}_weights_model3.h5f'.format(ENV_NAME))

# Redirect stdout to capture test results
old_stdout = sys.stdout
sys.stdout = mystdout = io.StringIO()
Example #30
    def init_policy(self, policy_dict):
        self.policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), **policy_dict)
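A hedged example of a policy_dict this helper could be called with (the keyword names are LinearAnnealedPolicy's own parameters; the values are illustrative):

policy_dict = {
    'attr': 'eps',        # attribute of EpsGreedyQPolicy to anneal
    'value_max': 1.0,     # start fully random
    'value_min': 0.1,     # end with 10% random actions
    'value_test': 0.05,   # epsilon used during testing
    'nb_steps': 100000,   # anneal over this many steps
}
agent.init_policy(policy_dict)  # 'agent' is whatever object defines init_policy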
Example #31
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)
processor = AtariProcessor()

# Select a policy. We use eps-greedy action selection, which means that a random action is selected
# with probability eps. We anneal eps from 1.0 to 0.1 over the course of 1M steps. This is done so that
# the agent initially explores the environment (high eps) and then gradually sticks to what it knows
# (low eps). We also set a dedicated eps value that is used during testing. Note that we set it to 0.05
# so that the agent still performs some random actions. This ensures that the agent cannot get stuck.
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                              attr='eps',
                              value_max=1.,
                              value_min=.1,
                              value_test=.05,
                              nb_steps=1000000)

# The trade-off between exploration and exploitation is difficult and an on-going research topic.
# If you want, you can experiment with the parameters or use a different policy. Another popular one
# is Boltzmann-style exploration:
# policy = BoltzmannQPolicy(tau=1.)
# Feel free to give it a try!

dqn = DQNAgent(model=model,
               nb_actions=nb_actions,
               policy=policy,
Example #32
def main():
    """
    Initialization of all parameters, neural net, agent, training, validation and testing
    """
    write_model_info()  # save the parameters used for this model to a file

    # set up Environment and variables
    if METHOD == trailing:
        env = TrailEnv(FOLDER,
                       STEPS,
                       train_data,
                       test_data,
                       TEST_POINTS,
                       val_data=VAL_DATA,
                       val_starts=VAL_STARTS,
                       limit_data=DATA_SIZE,
                       one_hot=ONE_HOT,
                       cost=COST,
                       margin=MARGIN,
                       turn=TURN,
                       ce=CE,
                       dp=DP,
                       normalize_in=NORMALIZE_IN,
                       reset_margin=RESET_FROM_MARGIN)
    else:
        env = DengEnv(FOLDER,
                      STEPS,
                      train_data,
                      test_data,
                      TEST_POINTS,
                      val_data=VAL_DATA,
                      val_starts=VAL_STARTS,
                      window=WINDOW_LENGTH,
                      limit_data=DATA_SIZE,
                      one_hot=ONE_HOT,
                      cost=COST_D)

    # set up the model
    model = set_model(env)

    memory = SequentialMemory(limit=MEM_SIZE, window_length=WINDOW_LENGTH)

    # Exploration policy
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                                  attr='eps',
                                  value_max=1.0,
                                  value_min=0.1,
                                  value_test=0.05,
                                  nb_steps=EXPLORE_STEPS)

    nb_actions = env.action_space.n  # set up number of actions (outputs)

    # set up keras-rl agent
    dqn = DQNAgent(model=model,
                   gamma=GAMMA,
                   nb_actions=nb_actions,
                   memory=memory,
                   batch_size=BATCH_SIZE,
                   nb_steps_warmup=1000,
                   target_model_update=TAR_MOD_UP,
                   policy=policy,
                   delta_clip=DELTA_CLIP)

    dqn.compile(Adam(lr=LR, decay=LR_DEC), metrics=['mse'])

    if START_FROM_TRAINED:
        dqn.load_weights(TRAINED_WEIGHTS)

    if VALIDATE:
        train_w_validation(env, dqn)
    else:
        train(env, dqn)

    fin_stats(env, STEPS)
    test(env, dqn)