Example #1
def test_fit_observations():
    memory = SequentialMemory(100, window_length=2, ignore_episode_boundaries=False)
    agent = TestAgent(memory)
    env = TestEnv()
    agent.compile()
    agent.fit(env, 20, verbose=0)

    # Inspect memory to see if observations are correct.
    experiences = memory.sample(batch_size=6, batch_idxs=range(2, 8))

    assert experiences[0].reward == .4
    assert experiences[0].action == 3
    assert_allclose(experiences[0].state0, np.array([2, 3]))
    assert_allclose(experiences[0].state1, np.array([3, 4]))
    assert experiences[0].terminal1 is False

    assert experiences[1].reward == .5
    assert experiences[1].action == 4
    assert_allclose(experiences[1].state0, np.array([3, 4]))
    assert_allclose(experiences[1].state1, np.array([4, 5]))
    assert experiences[1].terminal1 is False

    assert experiences[2].reward == .6
    assert experiences[2].action == 5
    assert_allclose(experiences[2].state0, np.array([4, 5]))
    assert_allclose(experiences[2].state1, np.array([5, 6]))
    assert experiences[2].terminal1 is True

    # Experience 3 has been re-sampled since state0 would be terminal, in which case we
    # cannot really have a meaningful transition because the environment gets reset. We thus
    # just ensure that state0 is not terminal.
    assert not np.all(experiences[3].state0 == np.array([5, 6]))

    assert experiences[4].reward == .2
    assert experiences[4].action == 1
    assert_allclose(experiences[4].state0, np.array([0, 1]))
    assert_allclose(experiences[4].state1, np.array([1, 2]))
    assert experiences[4].terminal1 is False

    assert experiences[5].reward == .3
    assert experiences[5].action == 2
    assert_allclose(experiences[5].state0, np.array([1, 2]))
    assert_allclose(experiences[5].state1, np.array([2, 3]))
    assert experiences[5].terminal1 is False
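
# A minimal standalone sketch of the behaviour the test above relies on: with
# window_length=2, SequentialMemory stacks the last two observations into the
# state0/state1 of every sampled experience. Values are illustrative; it assumes
# keras-rl (rl.memory) is installed, as in the test.
from rl.memory import SequentialMemory

demo_memory = SequentialMemory(limit=100, window_length=2, ignore_episode_boundaries=False)
for step in range(5):
    # append(observation, action, reward, terminal)
    demo_memory.append(step, step, step / 10., step == 4)
for e in demo_memory.sample(batch_size=2):
    print(e.state0, e.action, e.reward, e.state1, e.terminal1)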
Example #2
if args.agent_type == 'conv':
    model = get_conv_model(model)
elif args.agent_type == 'rnn':
    model = get_rnn_model(model)
elif args.agent_type == 'drnn':
    model = get_double_rnn_model(model)
elif args.agent_type == 'ntm':
    model = get_ntm_model()
else:
    raise ValueError('unknown model type: {}'.format(args.agent_type))

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=100000, window_length=WINDOW_LENGTH)
processor = AtariProcessor()

# Select a policy. We use eps-greedy action selection, which means that a random action is selected
# with probability eps. We anneal eps from 1.0 to 0.1 over the course of 1M steps. This is done so that
# the agent initially explores the environment (high eps) and then gradually sticks to what it knows
# (low eps). We also set a dedicated eps value that is used during testing. Note that we set it to 0.02
# so that the agent still performs some random actions. This ensures that the agent cannot get stuck.
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                              attr='eps',
                              value_max=1.,
                              value_min=.1,
                              value_test=.02,
                              nb_steps=1000000)

# The trade-off between exploration and exploitation is difficult and an on-going research topic.
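
# Roughly what LinearAnnealedPolicy computes for eps at a given training step: a linear
# interpolation from value_max down to value_min over nb_steps, clamped at value_min
# afterwards. (Standalone sketch using the values configured above.)
def annealed_eps(step, value_max=1.0, value_min=0.1, nb_steps=1000000):
    slope = -(value_max - value_min) / float(nb_steps)
    return max(value_min, slope * step + value_max)

print(annealed_eps(0), annealed_eps(500000), annealed_eps(2000000))  # 1.0 0.55 0.1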
Example #3
env = Environment(dataSetPath=DATA_SET_PATH)
model = Sequential()
model.add(Flatten(input_shape=(WINDOW_SIZE, ) + env.observation_space.shape))
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dense(32))
model.add(Activation('relu'))
model.add(Dense(8))
model.add(Activation('relu'))
model.add(Dense(env.action_space.n))
model.add(Activation('linear'))

# model.summary()

memory = SequentialMemory(limit=env.dataLength, window_length=WINDOW_SIZE)
policy = BoltzmannQPolicy()
dqn = DQNAgent(model=model,
               nb_actions=env.action_space.n,
               memory=memory,
               nb_steps_warmup=100,
               target_model_update=1e-2,
               policy=policy)
dqn.compile(Adam(lr=0.001), metrics=['mae'])

if os.path.exists(WEIGHTS_NAME):
    dqn.load_weights(WEIGHTS_NAME)
    print("saved weights loaded")


def getPredictionAt(index=0):
Example #4
    if sys.argv[2] == 'train':
        input_shape = (1, ) + env.observation_space.shape

        # Define the DQN network
        # Options are left at their defaults for now
        model = Sequential()
        model.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
        model.add(Dense(512))
        model.add(Dense(512))
        model.add(Dense(nb_actions))
        print(model.summary())

        # Memory for experience replay
        # Rather than learning from each step in order, transitions are first stored in memory and then sampled at random for training
        # Honestly, I don't fully understand this part yet
        memory = SequentialMemory(limit=40000, window_length=1)

        # The action policy is the standard epsilon-greedy.
        policy = EpsGreedyQPolicy(eps=0.1)

        # warmup = literally a warm-up: do not start learning right away, but first fill the memory for a while
        # update = learning rate: a smaller value takes longer, a larger value overfits more easily
        dqn = DQNAgent(model=model,
                       nb_actions=nb_actions,
                       memory=memory,
                       nb_steps_warmup=100,
                       target_model_update=1e-2,
                       policy=policy)
        dqn.compile(Adam(lr=0.001))

        # nb_steps = how many steps to train; setting a huge number and stopping with Ctrl+C after a night is also fine
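
        # Hedged continuation sketch: the snippet is truncated here. With the dqn above and
        # the Gym-style env defined earlier in the original script, training would typically
        # be started like this (the step count is illustrative):
        dqn.fit(env, nb_steps=30000, visualize=False, verbose=2)
        dqn.save_weights('dqn_weights.h5f', overwrite=True)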
Example #5
def get_agent(env, agent_id, model=1):
    global observation_size
    # Count number of actions
    if not ingy:
        nb_actions = env.action_space['action_movement'][0].shape[0] + 2
        # Count number of observations for input
        if observation_size == 0:
            observation_size += env.observation_space[
                'observation_self'].shape[0]
            observation_size += env.observation_space['agent_qpos_qvel'].shape[0] * \
                                env.observation_space['agent_qpos_qvel'].shape[1]
            observation_size += env.observation_space['box_obs'].shape[
                0] * env.observation_space['box_obs'].shape[1]
            observation_size += env.observation_space['ramp_obs'].shape[
                0] * env.observation_space['ramp_obs'].shape[1]
            # TODO: Not sure whether to include mask_a*_obs and mask_ab_obs_spoof in this observation input -AH
    else:
        nb_actions = env.action_space.spaces['action_movement'].spaces[
            0].shape[0][0] + 2
        # Count number of observations for input
        if observation_size == 0:
            observation_size += env.observation_space.spaces[
                'observation_self'].shape[0]
            if 'lidar' in env.observation_space.spaces:
                observation_size += env.observation_space.spaces[
                    'lidar'].shape[0]
            observation_size += env.observation_space.spaces['agent_qpos_qvel'].shape[0] * \
                                env.observation_space.spaces['agent_qpos_qvel'].shape[1]
            observation_size += env.observation_space.spaces['box_obs'].shape[0] * \
                                env.observation_space.spaces['box_obs'].shape[1]
            observation_size += env.observation_space.spaces['ramp_obs'].shape[0] * \
                                env.observation_space.spaces['ramp_obs'].shape[1]

    if model == 1:
        # Build the actor model
        actor = Sequential()
        actor.add(Flatten(input_shape=(
            1,
            observation_size,
        )))
        actor.add(Dense(400))
        actor.add(Activation('relu'))
        actor.add(Dense(300))
        actor.add(Activation('relu'))
        actor.add(Dense(nb_actions))
        actor.add(Activation('sigmoid'))  # Return values from 0 to 1
        # print(actor.summary())

        # Build the critic model
        action_input = Input(shape=(nb_actions, ), name='action_input')
        observation_input = Input(shape=(
            1,
            observation_size,
        ),
                                  name='observation_input')
        flattened_observation = Flatten()(observation_input)
        x = Dense(400)(flattened_observation)
        x = Activation('relu')(x)
        x = Concatenate()([x, action_input])
        x = Dense(300)(x)
        x = Activation('relu')(x)
        x = Dense(1)(x)
        x = Activation('linear')(x)
        critic = Model(inputs=[action_input, observation_input], outputs=x)
        # print(critic.summary())

        # Build the agent
        memory = SequentialMemory(limit=100000, window_length=1)
        random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                  theta=2.15,
                                                  mu=0,
                                                  sigma=3)
        agent = DDPGAgent(nb_actions=nb_actions,
                          actor=actor,
                          critic=critic,
                          critic_action_input=action_input,
                          memory=memory,
                          nb_steps_warmup_critic=4000,
                          nb_steps_warmup_actor=4000,
                          random_process=random_process,
                          gamma=.9,
                          target_model_update=1e-3,
                          processor=MujocoProcessor(agent_id))
        agent.compile([Adam(lr=1e-4), Adam(lr=1e-3)], metrics=['mae'])

    elif model == 2:
        # Build the actor model
        actor = Sequential()
        actor.add(Flatten(input_shape=(
            1,
            observation_size,
        )))
        actor.add(Dense(400))
        actor.add(Activation('relu'))
        actor.add(Dense(300))
        actor.add(Dropout(0.3))
        actor.add(Activation('relu'))
        actor.add(Dense(100))
        actor.add(Dropout(0.2))
        actor.add(Activation('elu'))
        actor.add(Dense(50))
        actor.add(Dropout(0.2))
        actor.add(Activation('elu'))
        actor.add(Dense(nb_actions))
        actor.add(Activation('softmax'))  # Softmax output: values in [0, 1] that sum to 1
        # print(actor.summary())

        # Build the critic model
        action_input = Input(shape=(nb_actions, ), name='action_input')
        observation_input = Input(shape=(
            1,
            observation_size,
        ),
                                  name='observation_input')
        flattened_observation = Flatten()(observation_input)
        x = Dense(400)(flattened_observation)
        x = Activation('relu')(x)
        x = Concatenate()([x, action_input])
        x = Dense(300)(x)
        x = Activation('relu')(x)
        x = Dropout(0.3)(x)
        x = Dense(100)(x)
        x = Activation('elu')(x)
        x = Dropout(0.2)(x)
        x = Dense(50)(x)
        x = Activation('elu')(x)
        x = Dropout(0.2)(x)
        x = Dense(1)(x)
        x = Activation('tanh')(x)
        critic = Model(inputs=[action_input, observation_input], outputs=x)
        # print(critic.summary())

        # Build the agent
        memory = SequentialMemory(limit=100000, window_length=1)
        random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                  theta=2.8,
                                                  mu=0,
                                                  sigma=3.5)
        agent = DDPGAgent(nb_actions=nb_actions,
                          actor=actor,
                          critic=critic,
                          critic_action_input=action_input,
                          memory=memory,
                          nb_steps_warmup_critic=500,
                          nb_steps_warmup_actor=500,
                          random_process=random_process,
                          gamma=.9,
                          target_model_update=5e-2,
                          processor=MujocoProcessor(agent_id))
        agent.compile([Adam(lr=5e-1, decay=0.9),
                       Adam(lr=5e-1, decay=0.9)],
                      metrics=['mae'])

    return agent
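
# Hedged usage sketch for get_agent(): the multi-agent `env`, the `ingy` flag and
# MujocoProcessor are defined elsewhere in the original script and are assumed to exist
# here; the step count is illustrative.
agent = get_agent(env, agent_id=0, model=1)
agent.fit(env, nb_steps=50000, visualize=False, verbose=1)
agent.save_weights('ddpg_agent0_weights.h5f', overwrite=True)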
Example #6
def training_game():
    env = Environment()

    input_shape = (FLAGS.screen_size, FLAGS.screen_size, 1)
    nb_actions = 12  # Number of actions

    model = neural_network_model(input_shape, nb_actions)
    memory = SequentialMemory(limit=5000, window_length=_WINDOW_LENGTH)

    processor = SC2Proc()

    # Policy

    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                                  attr="eps",
                                  value_max=1,
                                  value_min=0.7,
                                  value_test=.0,
                                  nb_steps=1e6)

    # Agent

    dqn = DQNAgent(
        model=model,
        nb_actions=nb_actions,
        memory=memory,
        enable_double_dqn=False,
        nb_steps_warmup=500,
        # nb_steps_warmup=1,
        target_model_update=1e-2,
        policy=policy,
        batch_size=150,
        processor=processor)

    dqn.compile(Adam(lr=.001), metrics=["mae"])

    # Tensorboard callback

    callbacks = keras.callbacks.TensorBoard(log_dir='./Graph',
                                            histogram_freq=0,
                                            write_graph=True,
                                            write_images=False)

    # Save the parameters and upload them when needed

    name = FLAGS.mini_game
    w_file = "dqn_{}_weights.h5f".format(name)
    check_w_file = "train_w" + name + "_weights.h5f"

    if SAVE_MODEL:
        check_w_file = "train_w" + name + "_weights_{step}.h5f"

    log_file = "training_w_{}_log.json".format(name)

    if LOAD_MODEL:
        dqn.load_weights(w_file)

    dqn.fit(env,
            callbacks=callbacks,
            nb_steps=1e7,
            action_repetition=2,
            log_interval=1e4,
            verbose=2)

    dqn.save_weights(w_file, overwrite=True)
    dqn.test(env, action_repetition=2, nb_episodes=30, visualize=False)
Example #7
model.add(Dense(32))
model.add(Activation('relu'))

model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

nb_steps = 25000
nb_max_episode_steps_06_05_20_49 = 150
nb_max_episode_steps_06_06_16_24 = 100
nb_max_episode_steps_06_06_16_07 = 200
nb_max_episode_steps = nb_max_episode_steps_06_06_16_24

env.opt_reward = nb_max_episode_steps * 2

memory = SequentialMemory(limit=nb_steps, window_length=1)

# 0.1 : 4k, 0.25 : 4k, 0.5 : 7k-inf
policy_06_14_16_20 = BoltzmannGumbelQPolicy(C=20.0)
# more stable
# 0.1 : 4k, 0.25 : 4-5k, 0.5 : 10k-inf
policy_06_13_19_00 = BoltzmannQPolicy(tau=1.0)
policy_06_14_16_15 = MaxBoltzmannQPolicy(eps=0.1)
policy = policy_06_14_16_20

target_model_update_06_05_20_49 = 1e-2
target_model_update_06_05_22_18 = 1e-1
target_model_update_06_13_19_07 = 1e-3
target_model_update = target_model_update_06_05_20_49
batch_size_06_05_22_18 = 32
batch_size_07_05_16_07 = 64
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
logger.info(model.summary())

nb_episode_steps = 60
nb_episodes = 400

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=nb_episode_steps * nb_episodes,
                          window_length=1)
#policy = BoltzmannQPolicy()
policy = EpsGreedyQPolicy()
dqn = DQNAgent(model=model,
               nb_actions=nb_actions,
               memory=memory,
               nb_steps_warmup=10,
               target_model_update=1e-2,
               policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
dqn.fit(env,
        nb_max_episode_steps=nb_episode_steps,
Example #9
conv_1 = Conv2D(64, 3, padding='same')(rehsape_layer)
conv1_a = LeakyReLU()(conv_1)
flat_layer = Flatten()(conv1_a)
dense_1 = Dense(512)(flat_layer)
dense_1_a = LeakyReLU()(dense_1)
output_layer = Dense(nb_actions, activation='linear')(dense_1_a)
masked_layer = multiply([output_layer, mask])

model = Model([input_layer, mask], masked_layer)
model.summary()

train_mode = len(sys.argv) > 1 and sys.argv[1] == 'train'

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=MEMORY, window_length=WINDOW_LENGTH)
policy = BoltzmannQPolicy()
processor = SalpakanProcessor(env)

dqn = DQNAgent(model=model,
               nb_actions=nb_actions,
               memory=memory,
               nb_steps_warmup=WARM_UP,
               target_model_update=1e-2,
               policy=policy,
               processor=processor)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

if os.path.isfile(WEIGHTS_PATH) and os.access(WEIGHTS_PATH, os.R_OK):
    dqn.load_weights(WEIGHTS_PATH)
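
# Standalone sketch of what the multiply([output_layer, mask]) layer above achieves:
# Q-values of illegal actions are forced to 0, so argmax over the masked vector tends to
# pick only legal moves (values and shapes are illustrative).
import numpy as np

q_values = np.array([0.3, -1.2, 0.8, 0.1])   # raw Q-values from the network
legal = np.array([1, 0, 1, 0])               # mask supplied by the processor: 1 = legal
masked_q = q_values * legal                  # illegal actions zeroed out
print(int(np.argmax(masked_q)))              # -> 2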
Example #10
def main():
    env = PikaEnv()
    nb_actions = env.action_space.n

    model = Sequential()
    model.add(Flatten(input_shape=(4, ) + env.observation_space.shape))
    model.add(Dense(512))
    model.add(Activation("relu"))
    model.add(Dense(512))
    model.add(Activation("relu"))
    model.add(Dense(512))
    model.add(Activation("relu"))
    model.add(Dense(nb_actions))
    model.add(Activation("linear"))
    print(model.summary())
    memory = SequentialMemory(limit=1_000_000, window_length=4)
    policy = LinearAnnealedPolicy(
        EpsGreedyQPolicy(),
        attr="eps",
        value_max=1.0,
        value_min=0.05,
        value_test=0.05,
        nb_steps=nb_steps // 4,
    )
    dqn = DQNAgent(
        model=model,
        nb_actions=nb_actions,
        policy=policy,
        memory=memory,
        enable_dueling_network=True,
        enable_double_dqn=False,
    )
    dqn.compile(Adam(lr=0.00025), metrics=["mae"])
    # dqn.load_weights(log_dir + "load.h5f")
    weights_filename = log_dir + "dqn_weights.h5f"
    checkpoint_weights_filename = log_dir + "dqn_weights_{step}.h5f"
    log_filename = log_dir + "dqn_log.json"
    callbacks = [
        ModelIntervalCheckpoint(checkpoint_weights_filename, interval=10000)
    ]
    callbacks += [FileLogger(log_filename, interval=100)]
    tbCallBack = TensorBoard(
        log_dir=tb_dir,
        histogram_freq=0,
        write_graph=True,
        write_grads=True,
        write_images=True,
        embeddings_freq=0,
        embeddings_layer_names=None,
        embeddings_metadata=None,
    )
    callbacks += [tbCallBack]
    dqn.fit(
        env,
        callbacks=callbacks,
        nb_steps=nb_steps,
        log_interval=10,
        visualize=True,
        verbose=2,
    )

    # After training is done, we save the final weights one more time.
    dqn.save_weights(weights_filename, overwrite=True)
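
# The snippet above references nb_steps, log_dir and tb_dir, which are defined earlier in
# the original script; hypothetical placeholder definitions (not from the source):
import os

nb_steps = 1000000                     # total training steps
log_dir = "logs/pika/"                 # weights and JSON logs
tb_dir = os.path.join(log_dir, "tb")   # TensorBoard event files
os.makedirs(tb_dir, exist_ok=True)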
Example #11
    def build_agent(spec):
        """Defines a Keras-rl agent, ready for training.

        :param spec: a Namespace of agent specification options.
        :return: the rl agent
        """

        env = gym.make(spec.env)
        n_actions = env.action_space.n

        # Define network for Atari games
        if spec.rb_address is None:
            atari_agent = models.AtariAgent(
                env_name=spec.env,
                training=spec.training,
                one_life=not spec.no_onelife,
            )

        # Define network for Atari games + Restraining bolt
        else:
            atari_agent = models.RestrainedAtariAgent(
                env_name=spec.env,
                training=spec.training,
                one_life=not spec.no_onelife,
                frames_sender=streaming.AtariFramesSender(spec.env),
                rb_receiver=streaming.StateRewardReceiver(spec.rb_address),
            )

        # Samples are extracted from memory, not observed directly
        memory = SequentialMemory(limit=spec.memory_limit,
                                  window_length=atari_agent.window_length)

        # Linearly decrease eps (the random-action probability)
        train_policy = LinearAnnealedPolicy(
            EpsGreedyQPolicy(),
            attr="eps",
            value_max=spec.random_max,
            value_min=spec.random_min,
            value_test=spec.random_test,
            nb_steps=spec.random_decay_steps,
        )

        # Test policy: constant eps or per-episode
        test_policy = (EpsGreedyQPolicy(
            eps=spec.random_test) if not spec.random_epsilon else
                       models.EpisodeRandomEpsPolicy(min_eps=0.0,
                                                     max_eps=spec.random_test))

        # RL agent
        dqn = DQNAgent(
            model=atari_agent.model,
            enable_double_dqn=True,
            enable_dueling_network=False,
            nb_actions=n_actions,
            policy=train_policy,
            test_policy=test_policy,
            memory=memory,
            processor=atari_agent.processor,
            nb_steps_warmup=spec.steps_warmup,
            gamma=spec.gamma,
            batch_size=spec.batch_size,
            train_interval=spec.train_interval,
            target_model_update=spec.target_update,
            delta_clip=1.0,
            custom_model_objects=atari_agent.custom_layers,
        )
        dqn.compile(optimizer=Adam(lr=spec.learning_rate), metrics=["mae"])

        return dqn, atari_agent
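
# Hedged usage sketch: the real script fills `spec` from its argument parser; the values
# below are purely illustrative stand-ins for those options.
from argparse import Namespace

spec = Namespace(
    env="BreakoutDeterministic-v4", rb_address=None, training=True, no_onelife=False,
    memory_limit=1000000, random_max=1.0, random_min=0.1, random_test=0.05,
    random_decay_steps=1000000, random_epsilon=False, steps_warmup=50000, gamma=0.99,
    batch_size=32, train_interval=4, target_update=10000, learning_rate=0.00025)
dqn, atari_agent = build_agent(spec)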
Example #12
                    kernel_initializer='zeros',
                    activation='linear')(denses)

model = Model(inputs=[
    rgbimage_input, dimage_input, velocity_input, distance_input,
    geofence_input
],
              outputs=predictions)

train = True

tb = TensorBoard(log_dir='logs')

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=100000, window_length=1)  # reduce memory

processor = MultiInputProcessor(nb_inputs=5)

# Select a policy. We use eps-greedy action selection, which means that a random action is selected
# with probability eps. We anneal eps from 1.0 to 0.1 over the course of 100k steps. This is done so that
# the agent initially explores the environment (high eps) and then gradually sticks to what it knows
# (low eps). We also set a dedicated eps value that is used during testing; here it is 0.0, so the
# agent acts fully greedily at test time.
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                              attr='eps',
                              value_max=1.,
                              value_min=.1,
                              value_test=0.0,
                              nb_steps=100000)
def build_agent(model, actions):
    policy = BoltzmannQPolicy()
    memory = SequentialMemory(limit=100000, window_length=1)
    dqn = DQNAgent(model=model, memory=memory, policy=policy, nb_actions=actions,
                   nb_steps_warmup=20, target_model_update=1e-2)
    return dqn
flattened_observation = Flatten()(observation_input)  #?
x = concatenate([action_input, flattened_observation])
x = Dense(64)(x)
x = Activation('relu')(x)
x = Dense(400)(x)
x = Activation('relu')(x)
x = Dense(300)(x)
x = Activation('relu')(x)
x = Dense(1)(x)
x = Activation('linear')(x)
# not included: L2 weight decay
critic = Model(inputs=[action_input, observation_input], outputs=x)
print(critic.summary())

# Set up the agent for training
memory = SequentialMemory(limit=1000000,
                          window_length=1)  # not same as default
random_process = OrnsteinUhlenbeckProcess(theta=.15,
                                          mu=0.,
                                          sigma=.2,
                                          size=env.noutput)
agent = DDPGAgent(nb_actions=nb_actions,
                  actor=actor,
                  critic=critic,
                  critic_action_input=action_input,
                  memory=memory,
                  nb_steps_warmup_critic=100,
                  nb_steps_warmup_actor=100,
                  random_process=random_process,
                  gamma=.99,
                  target_model_update=1e-3,
                  delta_clip=1.)  # warmup? delta_clip?
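
# Hedged continuation sketch: the fragment stops after constructing the DDPG agent. The
# usual next steps in keras-rl examples are compile and fit; Adam is assumed to be imported
# from keras.optimizers, and `env` is the environment used above (it exposes env.noutput).
agent.compile(Adam(lr=1e-3, clipnorm=1.), metrics=['mae'])
agent.fit(env, nb_steps=100000, visualize=False, verbose=1, nb_max_episode_steps=1000)
agent.save_weights('ddpg_weights.h5f', overwrite=True)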
Example #15
from patternmatching.gray.incremental.query_call import load_graph, parse_args
from patternmatching.gray.incremental.rl_model import GraphEnv

logging.basicConfig(level=logging.INFO)

policies = {
    "bqp": BoltzmannQPolicy(),  # Unstable
    "gqp": GreedyQPolicy(),
    "egqp": EpsGreedyQPolicy(eps=0.1)  # eps should be around 0.1
}

window_length = 5  # Should be less than 20 (too large a value keeps the Q-values from converging)
memories = {
    "epm": EpisodeParameterMemory(limit=20,
                                  window_length=window_length),  # Non-episodic
    "sm": SequentialMemory(limit=20,
                           window_length=window_length)  # should use this
}

argv = sys.argv
if len(argv) < 4:
    print("Usage: python %s [ConfFile] [Policy] [Memory]" % argv[0])
    exit(1)

policy_name = argv[2]
if policy_name not in policies:
    print("Please specify correct policy name: %s" % str(policies.keys()))
    exit(1)
policy = policies[policy_name]

memory_name = argv[3]
if memory_name not in memories:
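    # Hedged continuation, mirroring the policy lookup above (the snippet is truncated here).
    print("Please specify correct memory name: %s" % str(memories.keys()))
    exit(1)
memory = memories[memory_name]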
Example #16
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(NODES))
model.add(PReLU())
model.add(Dense(NODES * 2))
model.add(PReLU())
model.add(Dense(NODES * 4))
model.add(PReLU())
model.add(Dense(NODES * 2))
model.add(PReLU())
model.add(Dense(nb_actions))
model.add(Activation('linear'))

memory = SequentialMemory(limit=memoria, window_length=1)
policy = EpsGreedyQPolicy()
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,
               batch_size=batch_size, target_model_update=1e-2, policy=policy,
               enable_double_dqn=True)
dqn.compile(Adam(lr=learning_rate), metrics=['mae'])

if not teste:
    dqn.fit(env, nb_steps=epocas, visualize=False, verbose=1)
    dqn.save_weights('dqn_weights.h5f', overwrite=True)
else:
    dqn.load_weights('dqn_weights_1.h5f')

dqn.test(env, nb_episodes=50, visualize=False)
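
# The snippet above relies on names defined earlier in the original script (env plus the
# hyperparameters NODES, memoria, batch_size, learning_rate, epocas and teste). A
# hypothetical preamble that would make it self-contained (values are illustrative):
import gym

env = gym.make('CartPole-v1')   # any discrete-action Gym environment
NODES = 32                      # width of the first Dense layer
memoria = 50000                 # replay-memory limit ("memoria" = memory)
batch_size = 32
learning_rate = 1e-3
epocas = 50000                  # training steps ("epocas" = epochs)
teste = False                   # True = skip training and only load/evaluate weights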
Example #17
def main(args):
    # Suppress TensorFlow warning messages for CPU
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    
    # Breakout environment name
    default_env = 'BreakoutDeterministic-v0'
    #default_env = 'BreakoutDeterministic-v4'
    
    window_length = 4
    nb_steps = 1750000
    # learning rate, based on the later DeepMind paper
    # "Rainbow: Combining Improvements in Deep Reinforcement Learning"
    # by Hessel et al. 2017, in which RMSProp was replaced by Adam
    # with a learning rate of 0.0000625
    lr_rate = 0.0000625
    
    # Application mode: train, test
    # train: Training breakout deep qlearning network
    # test: Test breakout deep qlearning network with pre-trained model
#    default_app_mode = 'train'
    default_app_mode = 'test'
    
    # Whether a lost life should start a new episode
    default_life_lost_check = False
    
    if len(args) == 0:
        app_mode = default_app_mode
    else:
        app_mode = args[0]
        
    if len(args) > 1:
        if args[1] == 'v4':
            ENV_NAME = 'BreakoutDeterministic-v4'
        else:
            ENV_NAME = 'BreakoutDeterministic-v0'
    else:
        ENV_NAME = default_env

    if len(args) > 2:
        if args[2] == 'check_life_lost':
            life_check = True
        else:
            life_check = False
    else:
        life_check = default_life_lost_check
    
    INPUT_SHAPE = (84, 84)
    
    try: 
        env = gym.make(ENV_NAME)

        np.random.seed(123)
        env.seed(123)
        nb_actions = env.action_space.n
        
        input_frame = Input(shape=(window_length,) + INPUT_SHAPE)
        dqn_out = Permute((2, 3, 1))(input_frame)
        # Use the He initializer for the relu-activated conv layers
        dqn_out = Conv2D(32, (8, 8), strides=(4, 4), activation='relu', 
                         kernel_initializer=he_normal())(dqn_out)
        dqn_out = Conv2D(64, (4, 4), strides=(2, 2), activation='relu',
                         kernel_initializer=he_normal())(dqn_out)
        dqn_out = Conv2D(64, (3, 3), strides=(1, 1), activation='relu',
                         kernel_initializer=he_normal())(dqn_out)
        dqn_out = Flatten()(dqn_out)
        dqn_out = Dense(512)(dqn_out)
        dqn_out = LeakyReLU()(dqn_out)
        dqn_out = Dense(nb_actions)(dqn_out)
        dqn_out = Activation('linear')(dqn_out)
        model = Model(inputs=[input_frame], outputs=[dqn_out])
        
        print(model.summary())
        
        memory = SequentialMemory(limit=nb_steps, window_length=window_length)
        
        #policy = BoltzmannQPolicy()
        
#        policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), 
#                                      attr='eps', 
#                                      value_max=1., 
#                                      value_min=.1, 
#                                      value_test=.05,
#                                      nb_steps=1000000)
#        
        policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), 
                                      attr='eps', 
                                      value_max=.001, 
                                      value_min=.0001, 
                                      value_test=.00005,
                                      nb_steps=1000000)
                
        processor = AtariProcessor(input_shape=INPUT_SHAPE)
        
        dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, 
                       # Whether to enable dueling network
                       enable_dueling_network = False,
                       # Training starts after the warm-up steps
                       nb_steps_warmup=50000,
                       processor=processor, 
                       #controls how often the target network is updated
                       target_model_update=10000, 
                       policy=policy, 
                       gamma=.99,
                       train_interval=4, 
                       delta_clip=1.,
                       
                       )
        
        dqn.compile(Adam(lr=lr_rate),
                    metrics=['mae'])
        
        weights_filename = 'dqn_{}_weights.h5f'.format(ENV_NAME)
        log_filename = 'dqn_{}_log.json'.format(ENV_NAME)
        
        if app_mode == 'train':
            # Load existing weights if they exist
            if os.path.exists(weights_filename):
                dqn.load_weights(weights_filename)
                        
            checkpoint_weights_filename = 'dqn_' + ENV_NAME + \
                                          '_weights_{step}.h5f'
            
            # Create step_logger and episode_logger to monitor training
#            step_filename = 'dqn_' + ENV_NAME + '_step.csv'
#            episode_filename = 'dqn_' + ENV_NAME + '_episode.csv'
#            step_logger = StepLogger(step_filename)
#            episode_logger = EpisodeLogger(episode_filename)
#            
            callbacks = [ModelIntervalCheckpoint(checkpoint_weights_filename, 
                                                 interval=250000)]
            callbacks += [FileLogger(log_filename, interval=100)]
            
            # Add step_logger and episode_logger as callbacks
#            callbacks += [LambdaCallback(
#                            on_step_end=step_logger.on_step_end)]
#            callbacks += [LambdaCallback(
#                            on_episode_end=episode_logger.on_episode_end)]
#            
            dqn.fit(env, 
                    callbacks=callbacks, 
                    log_interval=10000,
                    nb_steps=nb_steps, 
                    visualize=False,
                    # To avoid hangs when all lives are lost
                    nb_max_episode_steps=4000,
                    verbose=2,
                    # whether to check for lost lives and start a new episode
                    enable_life_lost_episode=life_check)
            
            # Save weights after training completed
            dqn.save_weights(weights_filename, overwrite=True)
            
            env.reset()
            
            # Evaluate 5 episodes to show the training results
            dqn.test(env, nb_episodes=5, visualize=False)
            
        elif app_mode == 'test':
            awards_list = []
            #env.reset()
            csv_filename = 'dqn_' + ENV_NAME + '_test.csv'
            csv_logger = AwardLogger(csv_filename)
                        
            def print_test_logs(batch, logs):
                #print(batch)
                #print(logs)
                awards_list.append(logs['episode_reward'])
                
            callbacks = [LambdaCallback(on_episode_end=print_test_logs)]
            callbacks += [LambdaCallback(on_step_end=csv_logger.on_step_end)]
            callbacks += [LambdaCallback(
                            on_episode_end=csv_logger.on_episode_end)]
            dqn.load_weights(weights_filename)
            dqn.test(env, callbacks=callbacks, nb_episodes=5, 
                     visualize=True, nb_max_episode_steps=4000)
            
            mean_award = np.mean(awards_list)
            print('Average awards: {0:0.2f}'.format(mean_award))
            
            # show step-award diagram
            csv_logger.plot_award()
            
        elif app_mode == 'plot-model':
            model_filename = 'dqn_' + ENV_NAME + '_model.pdf'
            dql_model = dqn.model
            plot_model(dql_model, 
                       to_file=model_filename, 
                       show_shapes=True, 
                       show_layer_names=False,
                       rankdir='TB')
        elif app_mode == 'plot-train':
            
            json_log_file = 'dqn_' + ENV_NAME + '_log.json'
            records     = pd.read_json(json_log_file)
            fig, ax = plt.subplots(2)
#            plt.plot(records['episode'], records['loss'])
            fig.suptitle("Loss Value vs Episode Reward")

            ax[0].plot(records['episode'], records['loss'], label='loss')
            ax[1].plot(records['episode'], records['episode_reward'],
                       label='reward')

            ax[0].set_ylabel('Loss')
            ax[1].set_ylabel('Reward')
                        
            #plt.yticks([0, 0.005, 0.010, 0.050, 0.100])
            #plt.title('Loss Value / Mean Q',fontsize=12)
            #plt.legend(loc="upper left")
            ax[1].set_xlabel("Episode")
            #ax = plt.gca()
            #ax.set_xticklabels([])
            
            plt.show()
            
        else:
            print(f"Syntax: python sls_breakout.py " +
                  f"<train | test | plot-model | plot-train> "+
                  f"[v4|v2] [check_life_lost]")
    finally:
        if env is not None:
            env.close()
def main():
    set_gpu_option()
    # OPTIONS
    ENV_NAME = 'DDPGEnv-v0'
    TIME_STEP = 30

    # Get the environment and extract the number of actions.

    PATH_TRAIN = '/home/data/training_x_150.h5'
    PATH_TEST = '/home/data/test_x_150.h5'
    """
    env = OhlcvEnv(TIME_STEP, path=PATH_TRAIN)
    env_test = OhlcvEnv(TIME_STEP, path=PATH_TEST)
    """
    store = pd.HDFStore(PATH_TRAIN, mode='r')
    varieties_list = store.keys()
    print('varieties_list: ', varieties_list)
    print('num varieties: ', len(varieties_list))
    
    variety = 'RB'
    print('variety: ', variety)
    
    # get selected features
    SELECTED_FACTOR_PATH = '~/feature_selection/根据互信息选出的特征,根据重要性排序.csv'
    selected_factor_df = pd.read_csv(SELECTED_FACTOR_PATH, index_col=0)
    selected_factor_list = selected_factor_df[variety].to_list()
    
    env = DDPGEnv(TIME_STEP, variety=variety, path=PATH_TRAIN, selected_factor_list=selected_factor_list)
    #env_test = DDPGEnv(TIME_STEP, variety=variety, path=PATH_TEST,  selected_factor_list=selected_factor_list)

    # random seed
    np.random.seed(123)
    env.seed(123)

    nb_actions = env.action_space.shape[0]
    print('nb_actions: ', nb_actions)

    print('env.observation_space.shape: ', env.observation_space.shape)
    print('env.observation_space: ', env.observation_space)
    
    # create actor
    actor = create_actor(input_shape=env.shape, nb_actions=nb_actions)
    
    # create critic
    action_input = Input(shape=(nb_actions,), name='action_input')
    observation_input = Input(shape=env.shape, name='observation_input')
    critic = create_critic(action_input, observation_input)
    


    # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and even the metrics!
    memory = SequentialMemory(limit=50000, window_length=TIME_STEP)

    random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3)
    ddpg = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=1000, nb_steps_warmup_actor=1000,
                  random_process=random_process, gamma=.99, target_model_update=1e-3, processor=DDPGProcessor())
    ddpg.compile(optimizer=Adam(lr=1e-3), metrics=['mae'])

    log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    tensorboard_callback = keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1, write_grads=True)
    for _ in range(3):
        ddpg.fit(env, nb_steps=140000, nb_max_episode_steps=140000, visualize=False, verbose=2)

    """
def controllerb(t, joints, links, joint2, joint3, joint4, joint5, rewardb_ros,
                joint1, agent, graph2, session2):

    if agent.value is None:
        # import keras-rl in NRP through virtual env
        import site, os
        site.addsitedir(
            os.path.expanduser(
                '~/.opt/tensorflow_venv/lib/python2.7/site-packages'))
        from keras.models import Model, Sequential
        from keras.layers import Dense, Activation, Flatten, Input, concatenate
        from keras.optimizers import Adam, RMSprop
        from rl.agents import DDPGAgent
        from rl.memory import SequentialMemory
        from rl.random import OrnsteinUhlenbeckProcess
        from keras import backend as K

        from tensorflow import Session, Graph
        K.clear_session()

        obs_shape = (6, )

        nb_actions = 5

        # create the nets for rl agent
        # actor net

        graph2.value = Graph()
        with graph2.value.as_default():
            session2.value = Session()
            with session2.value.as_default():

                actor = Sequential()
                actor.add(Flatten(input_shape=(1, ) + obs_shape))
                actor.add(Dense(32))
                actor.add(Activation('relu'))
                actor.add(Dense(32))
                actor.add(Activation('relu'))
                actor.add(Dense(32))
                actor.add(Activation('relu'))
                actor.add(Dense(nb_actions))
                actor.add(Activation('sigmoid'))
                clientLogger.info('actor net init')

                # critic net
                action_input = Input(shape=(nb_actions, ), name='action_input')
                observation_input = Input(shape=(1, ) + obs_shape,
                                          name='observation_input')
                flattened_observation = Flatten()(observation_input)
                x = concatenate([action_input, flattened_observation])
                x = Dense(64)(x)
                x = Activation('relu')(x)
                x = Dense(64)(x)
                x = Activation('relu')(x)
                x = Dense(64)(x)
                x = Activation('relu')(x)
                x = Dense(1)(x)
                x = Activation('linear')(x)
                critic = Model(inputs=[action_input, observation_input],
                               outputs=x)
                clientLogger.info('critic net init')

                # instanstiate rl agent
                memory = SequentialMemory(limit=1000, window_length=1)
                random_process = OrnsteinUhlenbeckProcess(theta=.15,
                                                          mu=0.,
                                                          sigma=.2,
                                                          size=nb_actions)
                agent.value = DDPGAgent(nb_actions=nb_actions,
                                        actor=actor,
                                        critic=critic,
                                        critic_action_input=action_input,
                                        memory=memory,
                                        nb_steps_warmup_critic=10,
                                        nb_steps_warmup_actor=10,
                                        random_process=random_process,
                                        gamma=.99,
                                        batch_size=5,
                                        target_model_update=1e-3,
                                        delta_clip=1.)
                agent.value.training = True
                clientLogger.info('rl agent init')

                PATH = '/home/user/WORK/NRP/NRP-local/Experiments/bf_manipulation_demo/ddpg_weights.h5'
                if os.path.isfile(PATH):
                    print('loading weights')
                    agent.load_weights(PATH)
                    clientLogger.info('weights loaded')

                agent.value.compile(Adam(lr=.001, clipnorm=1.),
                                    metrics=['mae'])
                clientLogger.info('agent compiled - ready to use')

#### run steps

#graph1.value = Graph()
    with graph2.value.as_default():
        #       session1.value = Session()
        with session2.value.as_default():

            import math
            import numpy as np

            angle_lower = links.value.pose[5].position.x
            angle_vel_lower = links.value.pose[7].position.x
            angle_upper = links.value.pose[9].position.x
            angle_vel_upper = links.value.pose[12].position.x
            #  clientLogger.info('humerus_angle ', links.value.pose[15].position.y)
            #  clientLogger.info('humerus_ang_vel ', angle_vel_lower)
            #  clientLogger.info('radius_angle ', angle_upper)
            #  clientLogger.info('radius_ang_vel ', angle_vel_lower)

            observation = np.array([
                math.cos(angle_lower),
                math.sin(angle_lower), angle_vel_lower,
                math.cos(angle_upper),
                math.sin(angle_upper), angle_vel_upper
            ])

            # get movement action from agent and publish to robot
            action = agent.value.forward(observation)
            clientLogger.info('agent stepped forward')

            # move robot
            joint1.send_message(std_msgs.msg.Float64(action[0]))
            joint2.send_message(std_msgs.msg.Float64(-action[1]))
            joint3.send_message(std_msgs.msg.Float64(action[2]))
            joint4.send_message(std_msgs.msg.Float64(action[3]))
            joint5.send_message(std_msgs.msg.Float64(action[4]))

            import math
            reward = \
            math.sqrt(math.pow((links.value.pose[57].position.x - links.value.pose[4].position.x),2) + \
            math.pow((links.value.pose[57].position.x - links.value.pose[4].position.x),2) + \
            math.pow((links.value.pose[57].position.x - links.value.pose[4].position.x),2))

            clientLogger.info('REWARD IS:', reward)
            rewardb_ros.send_message(reward)
            ## reward x would have to be minimized to move down!
            #-(angle_lower**2 + 0.1*angle_vel_lower**2 +
            #     angle_upper**2 + 0.1*angle_vel_upper**2 +
            #     0.001*np.sum(np.power(action, 2)))

            #learn from the reward
            agent.value.backward(reward)
            clientLogger.info('agent stepped backward')
            agent.value.step = agent.value.step + 1

            if agent.value.step % 20 == 0:
                clientLogger.info('saving weights')
                PATH = '/home/user/Desktop/keras_learning_weights/ddpg_weights_b.h5'
                agent.value.save_weights(PATH, overwrite=True)

            clientLogger.info('-------one step done')
Example #20
args = parser.parse_args()

# Get the environment and extract the number of actions.
env = gym.make(args.env_name)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

# Next, we build our model. We use the same model that was described by Mnih et al. (2015).
input_shape = (WINDOW_LENGTH, ) + INPUT_SHAPE
model = models.get_model(args.model, input_shape, nb_actions)

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=args.memory_size, window_length=WINDOW_LENGTH)
processor = AtariProcessor()

# Select a policy. We use eps-greedy action selection, which means that a random action is selected
# with probability eps. We anneal eps from 1.0 to 0.1 over the course of 1M steps. This is done so that
# the agent initially explores the environment (high eps) and then gradually sticks to what it knows
# (low eps). We also set a dedicated eps value that is used during testing. Note that we set it to 0.05
# so that the agent still performs some random actions. This ensures that the agent cannot get stuck.
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                              attr='eps',
                              value_max=1.,
                              value_min=.1,
                              value_test=.05,
                              nb_steps=1000000)

# The trade-off between exploration and exploitation is difficult and an on-going research topic.
    def test_quantile_regression(self):
        nb_inputs = 10
        nb_actions = 3
        nb_quantiles = 32
        batch_size = 64
        delta_clip = 1
        model = NetworkMLPDistributional(nb_inputs=nb_inputs,
                                         nb_outputs=nb_actions,
                                         nb_hidden_layers=2,
                                         nb_hidden_neurons=100,
                                         nb_quantiles=nb_quantiles,
                                         nb_cos_embeddings=64,
                                         duel=True,
                                         prior=False,
                                         activation='relu',
                                         duel_type='avg',
                                         window_length=1).model
        policy = LinearAnnealedPolicy(DistributionalEpsGreedyPolicy(eps=1),
                                      attr='eps',
                                      value_max=1.,
                                      value_min=0.1,
                                      value_test=.0,
                                      nb_steps=10000)
        test_policy = DistributionalEpsGreedyPolicy(eps=0)
        memory = SequentialMemory(limit=10000, window_length=1)
        agent = IQNAgent(model=model,
                         policy=policy,
                         test_policy=test_policy,
                         enable_double_dqn=True,
                         nb_samples_policy=nb_quantiles,
                         nb_sampled_quantiles=nb_quantiles,
                         cvar_eta=1,
                         nb_actions=nb_actions,
                         memory=memory,
                         gamma=0.99,
                         batch_size=batch_size,
                         nb_steps_warmup=1000,
                         train_interval=1,
                         memory_interval=1,
                         target_model_update=1000,
                         delta_clip=delta_clip)

        agent.compile(Adam(lr=0.0001))
        plot_model(agent.trainable_model,
                   to_file='trainable_model_2.png',
                   show_shapes=True)

        # Test input
        states = np.random.rand(batch_size, 1, nb_inputs)
        actions = np.random.randint(nb_actions, size=batch_size)
        test_quantiles = np.linspace(0, 1, nb_quantiles)
        z_values = agent.model.predict_on_batch(
            [states, test_quantiles[None, None, :]])
        # print(z_values[0])

        for i in range(3000):
            quantiles = np.random.rand(batch_size, 1, nb_quantiles)
            # targets = np.random.choice([1, 2, 3], batch_size)
            targets = np.random.choice([10, 22, 35], batch_size)
            targets = np.repeat(targets[:, None], nb_quantiles, axis=1)

            predictions = agent.model.predict_on_batch([states, quantiles])

            masks = np.zeros((batch_size, nb_actions))
            masks[range(batch_size), actions] = 1
            targets_expanded = np.zeros((batch_size, nb_quantiles, nb_actions))
            targets_expanded[range(batch_size), :,
                             actions] = targets[range(batch_size), :]

            loss = agent.trainable_model.predict_on_batch(
                [states, quantiles, targets_expanded, masks])

            metrics = agent.trainable_model.train_on_batch(
                [states, quantiles, targets_expanded, masks],
                [targets, targets_expanded])

            if np.mod(i, 100) == 0:
                test_quantiles = np.linspace(0, 1, nb_quantiles)
                z_values = agent.model.predict_on_batch(
                    [states, test_quantiles[None, None, :]])

        self.assertTrue(np.abs(np.mean(z_values[:, 1:10, :]) - 10) < 1.0)
        self.assertTrue(np.abs(np.mean(z_values[:, 12:20, :]) - 22) < 1.0)
        self.assertTrue(np.abs(np.mean(z_values[:, 23:31, :]) - 35) < 1.0)
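
# For context, the objective the IQN agent is driven toward in this test is the quantile
# Huber loss (kappa corresponds to delta_clip above). A minimal NumPy sketch of that loss,
# not the agent's actual Keras implementation:
import numpy as np

def quantile_huber_loss(targets, predictions, taus, kappa=1.0):
    u = targets - predictions                                  # per-quantile TD errors
    huber = np.where(np.abs(u) <= kappa,
                     0.5 * u ** 2,
                     kappa * (np.abs(u) - 0.5 * kappa))
    weight = np.abs(taus - (u < 0).astype(float))              # |tau - 1{u < 0}|
    return np.mean(weight * huber / kappa)

taus = np.linspace(0.05, 0.95, 32)
targets = np.full(32, 22.0)
print(quantile_huber_loss(targets, np.full(32, 22.0), taus))   # ~0: predictions match target
print(quantile_huber_loss(targets, np.full(32, 10.0), taus))   # much larger loss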
Example #22
def train_dqn_model(layers, rounds=10000):

    env = gym.make(ENV_NAME)
    env.seed(1)
    nb_actions = env.action_space.n
    window_length = 1

    print "nb_actions:"
    print nb_actions
    print "env.observation_space.shape:"
    print env.observation_space.shape

    model = generate_dense_model(
        (window_length, ) + env.observation_space.shape, layers, nb_actions)

    policy = EpsGreedyQPolicy()

    memory = SequentialMemory(limit=1000000,
                              ignore_episode_boundaries=False,
                              window_length=window_length)

    agent = DQNAgent(model=model,
                     nb_actions=nb_actions,
                     memory=memory,
                     nb_steps_warmup=16,
                     enable_double_dqn=True,
                     enable_dueling_network=True,
                     dueling_type='avg',
                     target_model_update=1e-2,
                     policy=policy,
                     batch_size=16)

    agent.compile(RMSprop(lr=1e-3), metrics=['mae'])

    #tb_cb = TensorBoard(log_dir='/tmp/log', write_images=1, histogram_freq=1)
    #cbks = [tb_cb]
    # play the game. learn something!
    # nb_max_episode_steps: maximum number of steps in a single training episode
    agent.fit(env,
              nb_steps=rounds,
              nb_max_episode_steps=nb_max_episode_steps_train,
              visualize=False,
              verbose=2)

    #print "#################Start Test%################"

    #agent.test(env, nb_episodes=100)

    #test_samples=samples_test

    features_extract = Features(vocabulary_file)
    spam_checker = Spam_Check()
    # Modify the current sample according to the chosen action so it evades detection
    spam_manipulatorer = Spam_Manipulator()

    success = 0
    sum = 0

    shp = (1, ) + tuple(model.input_shape[1:])

    for sample in samples_test:
        #print sample
        sum += 1

        for _ in range(nb_max_episode_steps_test):
            featurevectors = features_extract.extract(sample)
            if spam_checker.check_spam(featurevectors) < 1.0:
                success += 1
                print "Bypass spam rule!:"
                print sample
                break

            f = features_extract.extract(sample).reshape(shp)
            act_values = model.predict(f)
            action = np.argmax(act_values[0])
            sample = spam_manipulatorer.modify(sample, ACTION_LOOKUP[action])

    print "Sum:{} Success:{}".format(sum, success)

    return agent, model
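
# Hedged usage sketch: `layers` lists the hidden-layer sizes handed to generate_dense_model;
# ENV_NAME, samples_test and the helper classes come from the surrounding module and are
# assumed to exist. The numbers are illustrative.
agent, model = train_dqn_model(layers=[64, 32], rounds=10000)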
Example #23
mode = 'predict' if len(sys.argv) < 2 else sys.argv[1]
#env = gym.make(ENV_NAME)
config_path = PROJECT_ROOT / "data/esquare3/config_engine.json"
config = json.load(config_path.open('rt'))

env = CityFlowAgent(mode='train', config_path=config_path)

#np.random.seed(123)
#env.seed(123)

model = env.get_model()
model.summary()

weights_filename = env.weights_filename
log_filename = 'dqn_{}_log.json'.format(ENV_NAME)
memory = SequentialMemory(limit=1000000,
                          window_length=env.config['WINDOW_LENGTH'])
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                              attr='eps',
                              value_max=150.,
                              value_min=0.0,
                              value_test=.05,
                              nb_steps=10000)
policy = BoltzmannQPolicy()
dqn = DQNAgent(model=model,
               nb_actions=env.action_space.n,
               policy=policy,
               memory=memory,
               nb_steps_warmup=500,
               gamma=.9,
               target_model_update=1000,
               train_interval=100,
Example #24
# Input Layer
model.add(Flatten(input_shape=(1, ) + env.observation_space.shape))

# Hidden layers
for _ in range(NUM_HIDDEN_LAYERS):
    model.add(Dense(LAYER_SIZE))
    model.add(Activation('relu'))

# Output layer
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=NUM_STEPS, window_length=1)
# train_policy = BoltzmannQPolicy(tau=0.05)
train_policy = EpsGreedyQPolicy()
test_policy = GreedyQPolicy()

# Compile the agent based on method specified. We use .upper() to convert to
# upper case for comparison
if METHOD.upper() == 'DUEL_DQN':
    memory = SequentialMemory(limit=NUM_STEPS, window_length=1)
    agent = DQNAgent(model=model,
                     nb_actions=nb_actions,
                     memory=memory,
                     nb_steps_warmup=100,
                     enable_dueling_network=True,
                     dueling_type='avg',
                     target_model_update=1e-2,
Example #25
x = Concatenate()([action_input, Flatten()(observation_input)])
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(((nb_actions * nb_actions + nb_actions) // 2))(x)
x = Activation('linear')(x)
L_model = Model(inputs=[action_input, observation_input], outputs=x)
print(L_model.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
processor = PendulumProcessor()
memory = SequentialMemory(limit=MEMORY_LIMIT, window_length=WINDOW_LENGHT)
random_process = OrnsteinUhlenbeckProcess(theta=.15,
                                          mu=0.,
                                          sigma=.3,
                                          size=nb_actions)
agent = NAFAgent(nb_actions=nb_actions,
                 V_model=V_model,
                 L_model=L_model,
                 mu_model=mu_model,
                 memory=memory,
                 nb_steps_warmup=NB_STEPS_WARMUP,
                 random_process=random_process,
                 gamma=GAMMA,
                 target_model_update=TARGET_MODEL_UPDATE,
                 processor=processor)
agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])
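
# The snippet shows only L_model; NAFAgent also needs the V_model (scalar state value) and
# mu_model (greedy action) referenced above. A hedged sketch of typical heads, reusing the
# observation_input defined earlier in the original script (layer sizes are illustrative):
v = Flatten()(observation_input)
v = Dense(32, activation='relu')(v)
V_model = Model(inputs=observation_input, outputs=Dense(1, activation='linear')(v))

m = Flatten()(observation_input)
m = Dense(32, activation='relu')(m)
mu_model = Model(inputs=observation_input, outputs=Dense(nb_actions, activation='linear')(m))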
Example #26
def training_game():
    env = Environment(
        map_name="ForceField",
        visualize=True,
        game_steps_per_episode=150,
        agent_interface_format=features.AgentInterfaceFormat(
            feature_dimensions=features.Dimensions(screen=64, minimap=32)))

    input_shape = (_SIZE, _SIZE, 1)
    nb_actions = 12  # Number of actions

    model = neural_network_model(input_shape, nb_actions)
    memory = SequentialMemory(limit=5000, window_length=_WINDOW_LENGTH)

    processor = SC2Proc()

    # Policy

    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                                  attr="eps",
                                  value_max=1,
                                  value_min=0.2,
                                  value_test=.0,
                                  nb_steps=1e2)

    # Agent

    dqn = DQNAgent(
        model=model,
        nb_actions=nb_actions,
        memory=memory,
        enable_double_dqn=True,
        enable_dueling_network=True,
        # 2019-07-12 GU Zhan (Sam): if a value-shape problem occurs, reduce nb_steps_warmup:
        #                   nb_steps_warmup=300, target_model_update=1e-2, policy=policy,
        nb_steps_warmup=500,
        target_model_update=1e-2,
        policy=policy,
        batch_size=150,
        processor=processor,
        delta_clip=1)

    dqn.compile(Adam(lr=.001), metrics=["mae", "acc"])

    # Tensorboard callback

    timestamp = f"{datetime.datetime.now():%Y-%m-%d %I:%M%p}"
    # 2019-07-12 GU Zhan (Sam) folder name for Linux:
    #    callbacks = keras.callbacks.TensorBoard(log_dir='./Graph/'+ timestamp, histogram_freq=0,
    #                                write_graph=True, write_images=False)

    # 2019-07-12 GU Zhan (Sam) folder name for Windows:
    callbacks = keras.callbacks.TensorBoard(log_dir=r'.\Graph\issgz',  # raw string keeps the backslashes literal
                                            histogram_freq=0,
                                            write_graph=True,
                                            write_images=False)

    # Save the parameters and upload them when needed

    name = "agent"
    w_file = "dqn_{}_weights.h5f".format(name)
    check_w_file = "train_w" + name + "_weights.h5f"

    if SAVE_MODEL:
        check_w_file = "train_w" + name + "_weights_{step}.h5f"

    log_file = "training_w_{}_log.json".format(name)

    if LOAD_MODEL:
        dqn.load_weights(w_file)

    class Saver(Callback):
        def on_episode_end(self, episode, logs={}):
            if episode % 200 == 0:
                self.model.save_weights(w_file, overwrite=True)

    s = Saver()
    logs = FileLogger('DQN_Agent_log.csv', interval=1)

    dqn.fit(env,
            callbacks=[callbacks, s, logs],
            nb_steps=600,
            action_repetition=2,
            log_interval=1e4,
            verbose=2)

    dqn.save_weights(w_file, overwrite=True)
    dqn.test(env, action_repetition=2, nb_episodes=30, visualize=False)
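With the settings above, LinearAnnealedPolicy moves eps linearly from value_max=1 down to value_min=0.2 over nb_steps=1e2 agent steps and then holds it there (value_test applies only during dqn.test). A small sketch of that schedule:

def annealed_eps(step, value_max=1.0, value_min=0.2, nb_steps=100):
    """eps decays linearly with the agent's step counter, then stays at value_min."""
    slope = -(value_max - value_min) / float(nb_steps)
    return max(value_min, slope * step + value_max)

# annealed_eps(0) == 1.0, annealed_eps(50) == 0.6, annealed_eps(1000) == 0.2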
Example #27
0
    print(model.summary())

    Agent = AGENT_DIC[args.agent]
    if args.agent == 'cem':
        memory = EpisodeParameterMemory(limit=args.memory_limit,
                                        window_length=args.batch_size)
        agent = Agent(model=model,
                      nb_actions=nb_actions,
                      memory=memory,
                      batch_size=args.batch_size,
                      nb_steps_warmup=args.steps_warmup,
                      train_interval=1,
                      elite_frac=args.elite_frac)
        agent.compile()
    elif args.agent == 'dqn':
        memory = SequentialMemory(limit=args.memory_limit,
                                  window_length=args.batch_size)
        policy = BoltzmannQPolicy()
        agent = DQNAgent(model=model,
                         nb_actions=nb_actions,
                         memory=memory,
                         batch_size=args.batch_size,
                         nb_steps_warmup=args.steps_warmup,
                         target_model_update=1e-2,
                         policy=policy)
        agent.compile(Adam(lr=1e-3), metrics=['mae'])
    if args.train == 1:
        if not args.wandb_flag:
            # import ipdb; ipdb.set_trace()
            agent.fit(env,
                      nb_steps=args.nb_steps_train,
                      visualize=False,
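The DQN branch above uses BoltzmannQPolicy, which samples an action from a softmax over the Q-values instead of acting eps-greedily; the temperature tau controls how greedy the distribution is. A minimal sketch of that selection rule (the clipping bounds here are assumptions):

import numpy as np

def boltzmann_action(q_values, tau=1.0, clip=(-500.0, 500.0)):
    """Sample an action index from softmax(q / tau); lower tau means greedier."""
    q = np.asarray(q_values, dtype='float64')
    exp_q = np.exp(np.clip(q / tau, *clip))
    probs = exp_q / exp_q.sum()
    return np.random.choice(len(q), p=probs)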
Example #28
0
    def train(
        self,
        env,
        input_fn,
        max_steps=10000,
        policy=EpsGreedyQPolicy(),
        memory=SequentialMemory(limit=1000, window_length=1),
        target_model_update=10000,
        gamma=0.99,
        warmup_steps=None,
        batch_size=64,
        summary_steps=100,
        visualize=False,
    ):

        min_memory = (max(warmup_steps, batch_size)
                      if warmup_steps is not None else batch_size)

        with tf.Graph().as_default() as graph:

            inputs = input_fn()

            print(inputs)

            tf.train.get_or_create_global_step()

            #####################
            # start model_fn

            with tf.variable_scope("Model") as model_scope:
                model_inputs = dict(state=inputs["state0"])
                model_q_values = self.model_fn(model_inputs,
                                               tf.estimator.ModeKeys.TRAIN,
                                               self.params)

            with tf.variable_scope("Model", reuse=True) as predict_scope:
                model_inputs = dict(state=inputs["state0"])
                predict_q_values = self.model_fn(model_inputs,
                                                 tf.estimator.ModeKeys.PREDICT,
                                                 self.params)

            with tf.variable_scope("TargetModel") as target_scope:
                target_model_inputs = dict(state=inputs["state1"])
                target_q_values = self.model_fn(target_model_inputs,
                                                tf.estimator.ModeKeys.PREDICT,
                                                self.params)

            # get variables
            model_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                scope=model_scope.name)
            target_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                 scope=target_scope.name)

            not_terminal = 1.0 - tf.cast(inputs["terminal"], tf.float32)
            action_values = tf.reduce_max(target_q_values, axis=1)
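            # Bellman target: r + gamma * max_a' Q_target(s', a'), with the
            # bootstrap term zeroed out on terminal transitions.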
            target_values = (inputs["reward"] +
                             gamma * action_values * not_terminal)

            assert (action_values.get_shape().as_list() ==
                    inputs["reward"].get_shape().as_list())
            assert (action_values.get_shape().as_list() ==
                    not_terminal.get_shape().as_list())

            model_action_values = utils.select_columns(model_q_values,
                                                       inputs["action"])

            tf.losses.mean_squared_error(target_values, model_action_values)
            # loss = tf.reduce_mean(loss)
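            # mean_squared_error registers its value in the GraphKeys.LOSSES
            # collection; get_total_loss() below sums that collection.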

            loss = tf.losses.get_total_loss()

            optimizer = tf.train.AdamOptimizer(
                learning_rate=self.params.learning_rate)

            with tf.control_dependencies(
                    tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
                train_op = optimizer.minimize(
                    loss,
                    global_step=tf.train.get_global_step(),
                    var_list=tf.get_collection(
                        tf.GraphKeys.TRAINABLE_VARIABLES,
                        scope=model_scope.name),
                )

            tf.summary.scalar("target", tf.reduce_mean(target_values))

            # end model_fn
            #####################

            #####################
            # train stuff

            tf.summary.scalar("loss", loss)

            train_summaries = tf.summary.merge_all()

            if target_model_update >= 1:
                update_target_op = tf.cond(
                    # global_step % target_model_update == 0
                    tf.equal(
                        tf.mod(tf.train.get_global_step(),
                               target_model_update),
                        0,
                    ),
                    lambda: update_target_weights_hard(target_variables,
                                                       model_variables),
                    lambda: tf.no_op(),
                )
            else:
                update_target_op = update_target_weights_soft(
                    target_variables, model_variables, target_model_update)

            final_train_op = tf.group(
                train_op,
                update_target_op,
            )

            # train stuff
            #####################

            #####################
            # episode stuff

            episode_length_t = tf.placeholder(tf.int32, name="episode_length")
            episode_reward_t = tf.placeholder(tf.float32, name="episode_reward")  # rewards are floats

            episode_length_summary = tf.summary.scalar("episode_length",
                                                       episode_length_t)
            episode_reward_summary = tf.summary.scalar("episode_reward",
                                                       episode_reward_t)

            final_episode_op = tf.group()
            episode_summaries = tf.summary.merge(
                [episode_length_summary, episode_reward_summary])

            # episode stuff
            #####################

            state0_t, reward_t, terminal_t, action_t, state1_t = [
                inputs[x]
                for x in ["state0", "reward", "terminal", "action", "state1"]
            ]

            global_variables_initializer = tf.global_variables_initializer()

        graph.finalize()

        writer = tf.summary.FileWriter(self.model_dir)

        with tf.Session(graph=graph) as sess:

            utils.initialize_or_restore(sess, self.model_dir,
                                        global_variables_initializer)

            current_step = sess.run(tf.train.get_global_step())

            state0 = env.reset()

            _episode_length = 0
            _episode_reward = 0.0

            for step in range(current_step, max_steps):

                step_feed = {state0_t: [state0]}

                predictions = sess.run(predict_q_values, step_feed)

                action = policy.select_action(q_values=predictions[0])

                state1, reward, terminal, _info = env.step(action)

                if visualize:
                    env.render()

                #
                _episode_length += 1
                _episode_reward += reward
                #

                memory.append(state0, action, reward, terminal)

                train_fetches = {}
                train_feed = {}

                if memory.nb_entries > min_memory:
                    experiences = memory.sample(batch_size)
                    experiences = [list(x) for x in zip(*experiences)]

                    state0_a, action_a, reward_a, state1_a, terminal_a = experiences

                    state0_a = np.squeeze(state0_a)
                    state1_a = np.squeeze(state1_a)

                    train_feed.update({
                        state0_t: state0_a,
                        action_t: action_a,
                        reward_t: reward_a,
                        state1_t: state1_a,
                        terminal_t: terminal_a,
                    })

                    train_fetches["train_op"] = final_train_op

                    if step % summary_steps == 0:
                        train_fetches["train_summaries"] = train_summaries

                if terminal:
                    train_feed[episode_length_t] = _episode_length
                    train_feed[episode_reward_t] = _episode_reward

                    train_fetches["episode_op"] = final_episode_op
                    train_fetches["episode_summaries"] = episode_summaries

                if step % summary_steps == 0:
                    pass

                # do training
                results = sess.run(train_fetches, train_feed)

                if "train_summaries" in results:
                    writer.add_summary(
                        results["train_summaries"],
                        step,
                    )

                if "episode_summaries" in results:
                    writer.add_summary(
                        results["episode_summaries"],
                        step,
                    )

                # end step
                if terminal:
                    state0 = env.reset()
                    #
                    _episode_length = 0
                    _episode_reward = 0.0
                    #

                else:
                    state0 = state1
Example #29
0
nb_actions = env.action_space.n

# agent network
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(32*scale))
model.add(Activation('relu'))
model.add(Dense(16*scale))
model.add(Activation('relu'))
model.add(Dense(8*scale))
model.add(Activation('relu'))
model.add(Dense(nb_actions, activation='softmax'))
print(model.summary())

# specifications for the RL agent
memory = SequentialMemory(limit=replay_size, window_length=win_len)
policy = EpsGreedyQPolicy()
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=nb_steps_warmup,
               enable_dueling_network=True, dueling_type='avg', target_model_update=target_model_update, policy=policy)

# compiling the model
dqn.compile(Adam(lr=lrn_rate), metrics=['mae'])

# set up callbacks for result collection and real-time visualization through TensorBoard
tensorboard = TensorBoard(log_dir="logs/{}".format(time()))
tpl = TrainEpisodeLogger()

# finally, perform the training; visualize=False runs without rendering the game, which speeds up training
dqn.fit(env, nb_steps=nb_steps, visualize=False, verbose=2, callbacks=[tensorboard, tpl], nb_max_episode_steps=nb_max_episode_steps)

# save the model weights
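The snippet is truncated here; going by the pattern in the other examples, the save step presumably looks something like the line below (the filename is illustrative):

dqn.save_weights('dqn_weights.h5f', overwrite=True)  # illustrative filename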
Example #30
0
    model.add(Dense(50))
    model.add(Activation('relu'))
    model.add(Dense(100))

    return model


# In[13]:

model = nn_model()
model.summary()

# In[21]:

policy = EpsGreedyQPolicy()
memory = SequentialMemory(limit=50000, window_length=1)
dqn = DQNAgent(model=model,
               nb_actions=100,
               memory=memory,
               nb_steps_warmup=100,
               target_model_update=1e-3,
               policy=policy)
dqn.compile(Adam(lr=1e-4), metrics=['mae'])

# Okay, now it's time to learn something! Visualization is disabled here (visualize=False) because rendering slows down training quite a lot.
history = dqn.fit(env, nb_steps=100000, visualize=False, verbose=1)

# In[19]:

print(history)
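print(history) only shows the History object itself; the per-episode numbers live in history.history (in recent keras-rl versions the keys include 'episode_reward' and 'nb_episode_steps'). A short sketch that summarizes the run, assuming those keys are present:

rewards = history.history.get('episode_reward', [])
if rewards:
    print('episodes: {}, mean reward: {:.2f}'.format(
        len(rewards), sum(rewards) / len(rewards)))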
def training_game():
    env = Environment(
        map_name="HallucinIce",
        visualize=True,
        game_steps_per_episode=150,
        agent_interface_format=features.AgentInterfaceFormat(
            feature_dimensions=features.Dimensions(screen=64, minimap=32)))

    input_shape = (_SIZE, _SIZE, 1)
    nb_actions = 12  # Number of actions

    model = neural_network_model(input_shape, nb_actions)
    memory = SequentialMemory(limit=5000, window_length=_WINDOW_LENGTH)

    processor = SC2Proc()

    # Policy

    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                                  attr="eps",
                                  value_max=1,
                                  value_min=0.2,
                                  value_test=.0,
                                  nb_steps=1e2)

    # Agent

    dqn = DQNAgent(model=model,
                   nb_actions=nb_actions,
                   memory=memory,
                   enable_double_dqn=True,
                   enable_dueling_network=True,
                   nb_steps_warmup=500,
                   target_model_update=1e-2,
                   policy=policy,
                   batch_size=150,
                   processor=processor,
                   delta_clip=1)

    dqn.compile(Adam(lr=.001), metrics=["mae", "acc"])

    # Tensorboard callback

    callbacks = keras.callbacks.TensorBoard(log_dir='./Graph',
                                            histogram_freq=0,
                                            write_graph=True,
                                            write_images=False)

    # Save the parameters and upload them when needed

    name = "HallDebbugeed"
    w_file = "dqn_{}_weights.h5f".format(name)
    check_w_file = "train_w" + name + "_weights.h5f"

    if SAVE_MODEL:
        check_w_file = "train_w" + name + "_weights_{step}.h5f"

    log_file = "training_w_{}_log.json".format(name)

    if LOAD_MODEL:
        dqn.load_weights(w_file)

    dqn.fit(env,
            callbacks=[callbacks],
            nb_steps=1e7,
            action_repetition=2,
            log_interval=1e4,
            verbose=2)

    dqn.save_weights(w_file, overwrite=True)
    dqn.test(env, action_repetition=2, nb_episodes=30, visualize=False)