Example #1
def create(env):
    np.random.seed(config.current.domain_seed)
    env.seed(config.current.domain_seed)
    nb_actions = env.action_space.n

    # Next, we build a very simple model.
    model = Sequential()
    model.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
    model.add(Dense(config.current.agent_vfn_complexity))
    model.add(Activation('relu'))
    model.add(Dense(config.current.agent_vfn_complexity))
    model.add(Activation('relu'))
    model.add(Dense(config.current.agent_vfn_complexity))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))
    global graph
    graph = tf.get_default_graph()

    # SARSA does not require a memory.
    policy = BoltzmannQPolicy()
    sarsa = SARSAAgent(model=model,
                       nb_actions=nb_actions,
                       nb_steps_warmup=10,
                       policy=policy)
    sarsa.compile(Adam(lr=1e-3), metrics=['mae'])
    return sarsa
Example #2
def init_sarsa(env, nb_actions, lr=1e-3):
    """ Initialize the Sarsa agent using the keras-rl package.

    :param env: the environment to be played, required to determine the input size
    :param nb_actions: number of actions
    :param lr: learning rate
    :return: Sarsa Agent
    """
    # Next, we build a very simple model.
    model = Sequential()
    model.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dense(256))
    model.add(Activation('relu'))
    model.add(Dense(64))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))

    # SARSA does not require a memory.
    policy = BoltzmannQPolicy()
    sarsa = SARSAAgent(model=model,
                       nb_actions=nb_actions,
                       nb_steps_warmup=10,
                       policy=policy)
    sarsa.model_name = "SARSA"
    sarsa.compile(Adam(lr=lr), metrics=['mae'])
    return sarsa
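A minimal usage sketch for init_sarsa, assuming the Gym and keras-rl versions these snippets target; the CartPole-v1 environment, seed, and step counts are illustrative choices rather than part of the original example.

import gym
import numpy as np

env = gym.make('CartPole-v1')          # any discrete-action environment works
np.random.seed(123)
env.seed(123)

sarsa = init_sarsa(env, nb_actions=env.action_space.n, lr=1e-3)
sarsa.fit(env, nb_steps=50000, visualize=False, verbose=2)   # train
sarsa.test(env, nb_episodes=5, visualize=False)              # evaluate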
Example #3
def run_sarsa():

    global N_NODE_NETWORK

    env = SnakeGymDiscrete()
    nb_actions = env.action_space.n

    # initialize randomness
    np.random.seed(123)
    env.seed(123)

    # create model
    model = Sequential()
    model.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
    model.add(Dense(N_NODE_NETWORK))
    model.add(Activation('relu'))
    model.add(Dense(N_NODE_NETWORK))
    model.add(Activation('relu'))
    model.add(Dense(N_NODE_NETWORK))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))

    # SARSA does not require a memory.
    policy = BoltzmannQPolicy()
    sarsa = SARSAAgent(model=model,
                       nb_actions=nb_actions,
                       nb_steps_warmup=10,
                       policy=policy)
    sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

    sarsa.fit(env, nb_steps=50000, visualize=False, verbose=2)
    sarsa.save_weights('sarsa_SnakeGymDiscrete_weights.h5f', overwrite=True)

    sarsa.test(env, nb_episodes=5, visualize=True)
Example #4
def create_sarsa_agent(env):
    env = create_environment()
    model = create_deep_model(env)
    nb_actions = env.action_space.n
    policy = BoltzmannQPolicy()
    sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=policy)
    sarsa.compile(Adam(lr=1e-3), metrics=['mae'])
    return sarsa
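Example #4 leans on two helpers, create_environment and create_deep_model, that are not shown. A hedged sketch of what they might look like, assuming a CartPole-style Gym environment; the bodies and layer sizes are illustrative guesses, not the original code.

import gym
from keras.models import Sequential
from keras.layers import Dense, Flatten

def create_environment():
    # Illustrative: any Gym environment with a discrete action space will do.
    return gym.make('CartPole-v1')

def create_deep_model(env):
    # Illustrative: a small MLP over the flattened observation with one
    # linear output per action, which is the shape SARSAAgent expects.
    model = Sequential()
    model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(env.action_space.n, activation='linear'))
    return model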
Example #5
    def __init__(self, state_dim, action_space, epsilon, gamma, lr):
        self._model = self._get_model(state_dim, action_space)
        self.agent = SARSAAgent(self._model,
                                nb_actions=action_space,
                                gamma=gamma,
                                policy=EpsGreedyQPolicy(epsilon),
                                test_policy=EpsGreedyQPolicy(eps=0.01))

        self.agent.compile(Adam(lr))
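The _get_model helper called by this constructor is not part of the snippet. A hedged sketch of an equivalent builder, written as a free function for clarity; the name build_q_model and the layer widths are illustrative.

from keras.models import Sequential
from keras.layers import Dense, Flatten

def build_q_model(state_dim, action_space):
    # Illustrative: a small MLP mapping (1, state_dim) observations to one
    # Q-value per action.
    model = Sequential()
    model.add(Flatten(input_shape=(1, state_dim)))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(action_space, activation='linear'))
    return model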
Example #6
def main():
    # binance = DataReader()
    env = BinanceEnv()
    # binance.get_recent_trades()
    # env.next_observation()
    # binance_market = BinanceMarket()
    # binance_market.long()
    # time.sleep(3)
    # binance_market.close_long()
    # time.sleep(3)
    # binance_market.short()
    # time.sleep(3)
    # binance_market.close_short()
    # binance_market.update_positions()
    # print(binance_market.balance)

    # episodes = 10
    # for episode in range(1, episodes + 1):
    #     # At each beginning reset the game
    #     state = env.reset()
    #     # set done to False
    #     done = False
    #     # set score to 0
    #     score = 0
    #     # while the game is not finished
    #     while not done:
    #         # visualize each step
    #         env.render()
    #         # choose a random action
    #         action = random.randint(0, 5)
    #         # execute the action
    #         n_state, reward, done, info = env.step(action)
    #         # keep track of rewards
    #         score += reward
    #     print('episode {} score {}'.format(episode, score))

    model = agent(env.observation_space.shape[0], env.action_space.n)
    policy = EpsGreedyQPolicy()
    sarsa = SARSAAgent(model=model, policy=policy, nb_actions=env.action_space.n)
    sarsa.compile('adam', metrics=['mse', 'accuracy'])
    # sarsa.load_weights('sarsa_weights_bnb_07.h5f')
    env.is_testing = False
    sarsa.fit(env, nb_steps=100000, visualize=False, verbose=1)
    sarsa.save_weights('sarsa_weights_bnb_07_1.h5f', overwrite=True)
    # sarsa.load_weights('sarsa_weights_bnb_07_1.h5f')
    # env.simulator = False
    env.is_testing = True
    scores = sarsa.test(env, nb_episodes=1, visualize=False)
    print('Average test episode reward: {}'.format(np.mean(scores.history['episode_reward'])))

    _ = sarsa.test(env, nb_episodes=10, visualize=True)
    obs = env.reset()
    for i in range(2000):
        # model.predict would return raw Q-values, not an action; let the
        # trained agent pick the action instead.
        action = sarsa.forward(obs)
        obs, rewards, done, info = env.step(action)
        env.render()
        if done:
            obs = env.reset()
Example #7
 def __init__(self, env, timesteps_per_episode=10001):
     super().__init__(env, timesteps_per_episode)
     self.num_episodes = 400
     self.evaluating = False
     self.action_size = env.action_space.n
     self.state_size = env.num_states
     self.model = self._build_compile_model()
     self.agent = SARSAAgent(model=self.model,
                             nb_actions=self.action_size,
                             policy=EpsGreedyQPolicy())
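_build_compile_model is not included in the snippet. Because the environment exposes discrete state indices (env.num_states), one plausible sketch embeds the state index; it is written as a free function here and is entirely illustrative rather than the original implementation.

from keras.models import Sequential
from keras.layers import Dense, Embedding, Reshape

def build_tabular_q_model(state_size, action_size):
    # Illustrative: embed the integer state index, then map the embedding to
    # one Q-value per action; SARSAAgent.compile() adds the optimizer later.
    model = Sequential()
    model.add(Embedding(state_size, 10, input_length=1))
    model.add(Reshape((10,)))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(action_size, activation='linear'))
    return model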
Example #8
def main():

    model = Sequential()
    model.add(Flatten(input_shape=(1, 7)))
    model.add(Dense(units=20, activation='relu'))
    model.add(Dense(units=20, activation='relu'))
    model.add(Dense(units=6, activation='linear'))
    logger.info(model.summary())

    steps = 1E9
    interval = steps // 100

    # policy = MyPolicy()
    policy = BoltzmannQPolicy()
    agent = SARSAAgent(model=model, nb_actions=6, policy=policy, train_interval=10, nb_steps_warmup=10)

    adam = Adam()
    sgd = SGD(lr=1e-3, momentum=0, decay=0, nesterov=False)
    agent.compile(optimizer=adam, metrics=['mse'])

    env = MyEnv()
    agent.fit(env, steps, verbose=2, visualize=True)

    fp = Path(__file__).resolve().parent / 'sarsa_weights.h5f'
    agent.save_weights(fp, overwrite=True)

    logger.info('Done')
Example #9
def main():

    # nb_actions = cpst._action_space
    nb_actions = 2
    # Next, we build a very simple model.
    model = Sequential()
    #n_os = cpst._observation_space.shape

    n_os = 4
    model.add(Flatten(input_shape=[1] + [n_os]))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))

    print(model.summary())
    model._make_predict_function()

    # SARSA does not require a memory.
    policy = BoltzmannQPolicy()
    sarsa = SARSAAgent(model=model,
                       nb_actions=nb_actions,
                       nb_steps_warmup=10,
                       policy=policy)
    sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

    cart_pole = CartPole(name='cp')

    log = logging.getLogger('bact2')

    RE = RunEngine({})
    RE.log.setLevel('DEBUG')
    cart_pole.log = RE.log

    stm = [cart_pole.x, cart_pole.x_dot, cart_pole.theta, cart_pole.theta_dot]
    cpst = CartPoleEnv(detectors=[cart_pole],
                       motors=[cart_pole],
                       state_motors=stm,
                       user_kwargs={'mode_var': cart_pole.rl_mode})

    np.random.seed(123)
    cpst.seed(123)

    partial = functools.partial(run_test, sarsa, cpst, log=RE.log)
    RE(run_environement(cpst, partial, log=RE.log))
Example #10
 def compile_agent(self):
     # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
     # even the metrics!
     processor = DistopiaProcessor(self.num_blocks, self.num_actions)
     #memory = SequentialMemory(limit=50000, window_length=1)
     #policy = PatchedBoltzmannQPolicy(num_actions = self.num_actions, num_blocks = self.num_blocks)
     #test_policy = PatchedGreedyQPolicy(num_actions = self.num_actions, num_blocks = self.num_blocks)
     policy = BoltzmannQPolicy()
     test_policy = GreedyQPolicy()
     self.sarsa = SARSAAgent(model=self.model,
                             processor=processor,
                             nb_actions=self.nb_actions,
                             nb_steps_warmup=1000,
                             policy=policy,
                             test_policy=test_policy,
                             gamma=0.9)
     self.sarsa.compile(Adam(lr=1e-3), metrics=['mae'])
Example #11
def main():
    # with ServerProxy("http://127.0.0.1:8000/", verbose=False, allow_none=True) as proxy:
    if True:
        pass

    #D:\Devel\github\keras-rl;D:\Devel\github\Devel\hz-b\naus
    # set PYTHONPATH=D:\Devel\github\keras-rl;D:\Devel\github\Devel\hz-b\naus
    # & python d:\Devel\github\Devel\hz-b\naus\examples\rl\cart_pole\sarsa_cartpole.py

    def stop_my_application():
        print('Stopping application')

    with allow_interrupt():
        # main polling loop.

        env = EnvironmentProxyForClient(receiver=None)
        np.random.seed(1974)
        env.seed(1974)

        env.reset()

        # nb_actions = cpst._action_space
        nb_actions = 2
        # Next, we build a very simple model.
        model = Sequential()
        #n_os = cpst._observation_space.shape

        n_os = 4
        model.add(Flatten(input_shape=[1] +[n_os]))
        model.add(Dense(16))
        model.add(Activation('relu'))
        model.add(Dense(16))
        model.add(Activation('relu'))
        model.add(Dense(16))
        model.add(Activation('relu'))
        model.add(Dense(nb_actions))
        model.add(Activation('linear'))
        print(model.summary())

        # SARSA does not require a memory.
        policy = BoltzmannQPolicy()
        sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=policy)
        sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

        run_test(sarsa, env, log=log)
Example #12
def keras_rl(env,
             model_name,
             saved_model_name="model",
             steps=50000,
             test_steps=5,
             visualize=False,
             hidden_layers=3,
             critic_hidden_layers=3):
    nb_actions = 0
    if (model_name == "DQN" or model_name == "SARSA"):
        nb_actions = env.action_space.n
    elif (model_name == "DDPG"):
        nb_actions = env.action_space.shape[0]

    model_structure = define_layers(env,
                                    nb_actions,
                                    num_of_hidden_layers=hidden_layers)
    memory = define_memory()
    policy = define_policy(model_name)

    if (model_name == "DQN"):
        model = DQNAgent(model=model_structure,
                         nb_actions=nb_actions,
                         memory=memory,
                         nb_steps_warmup=100,
                         enable_double_dqn=True,
                         dueling_type='avg',
                         target_model_update=1e-2)
    elif (model_name == "SARSA"):
        model = SARSAAgent(model=model_structure,
                           nb_actions=nb_actions,
                           nb_steps_warmup=10,
                           policy=policy)
    elif (model_name == "DDPG"):
        action_input, critic_layers = define_critic_layers(
            env, num_of_hidden_layers=critic_hidden_layers)
        random_process = define_random_process(nb_actions)
        model = DDPGAgent(nb_actions=nb_actions,
                          actor=model_structure,
                          critic=critic_layers,
                          critic_action_input=action_input,
                          memory=memory,
                          nb_steps_warmup_critic=100,
                          nb_steps_warmup_actor=100,
                          random_process=random_process,
                          gamma=.99,
                          target_model_update=1e-3)

    model.compile(Adam(lr=1e-3), metrics=['mae'])
    model.fit(env, nb_steps=steps, visualize=False, verbose=2)
    model.save_weights('{}.h5f'.format(model_name), overwrite=True)
    model.test(env, nb_episodes=test_steps, visualize=visualize)
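The define_* helpers used above are not shown. A hedged sketch built from standard keras-rl pieces (rl.memory, rl.policy, rl.random); the hyperparameters are illustrative defaults, not the original values.

from rl.memory import SequentialMemory
from rl.policy import BoltzmannQPolicy
from rl.random import OrnsteinUhlenbeckProcess

def define_memory(limit=50000, window_length=1):
    # Replay buffer for the DQN and DDPG branches; the SARSA branch ignores it.
    return SequentialMemory(limit=limit, window_length=window_length)

def define_policy(model_name):
    # Boltzmann exploration for the discrete agents; DDPG explores through its
    # random process instead, so the policy goes unused on that branch.
    return BoltzmannQPolicy()

def define_random_process(nb_actions):
    # Exploration noise for DDPG's continuous action vector.
    return OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3)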
Example #13
def run_sarsa_agent(driver, queries, candidate_indices, tuning_config):
    # Get the environment and extract the number of actions.
    env = gym.make("udo_optimization-v0", driver=driver, queries=queries, candidate_indices=candidate_indices,
                   config=tuning_config)
    env.horizon = tuning_config['horizon']

    nb_actions = env.action_space.n
    logging.info(f"nr action: {nb_actions}")
    logging.info(f"observation space: {env.observation_space.shape}")

    # Next, we build a very simple model.
    model = Sequential()
    model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    model.add(Dense(52))
    model.add(Activation('relu'))
    model.add(Dense(252))
    model.add(Activation('relu'))
    model.add(Dense(526))
    model.add(Activation('relu'))
    model.add(Dense(252))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))

    logging.info(model.summary())

    # SARSA does not require a memory.
    policy = BoltzmannQPolicy()
    # policy.select_action()
    sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=policy)
    sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

    # Okay, now it's time to learn something! We visualize the training here for show, but this
    # slows down training quite a lot. You can always safely abort the training prematurely using
    # Ctrl + C.
    sarsa.fit(env, nb_steps=500, visualize=False, verbose=2)

    # After training is done, we save the final weights.
    # sarsa.save_weights('sarsa_{}_weights.h5f'.format(udo_optimization-v0), overwrite=True)

    # Finally, evaluate our algorithm for 5 episodes.
    sarsa.test(env, nb_episodes=5, visualize=False)
    env.print_state_summary(env.best_state)
Example #14
def get_agent(agent_type, model_type, lr):
    if agent_type == "sarsa":
        policy = BoltzmannQPolicy()
        model = get_model(model_type)
        agent = SARSAAgent(model=model,
                           policy=policy,
                           nb_actions=nb_actions,
                           nb_steps_warmup=10,
                           gamma=0.99)
        agent.compile(Adam(lr), metrics=['mae'])
        return agent
    elif agent_type == "dqn":
        policy = BoltzmannQPolicy()
        model = get_model(model_type)
        memory = SequentialMemory(limit=50000, window_length=1)
        agent = DQNAgent(model=model,
                         policy=policy,
                         nb_actions=nb_actions,
                         memory=memory,
                         nb_steps_warmup=10,
                         target_model_update=1e-2,
                         enable_double_dqn=True)
        agent.compile(Adam(lr), metrics=['mae'])
        return agent
    elif agent_type == "a2c":
        agent = A2CAgent(nb_actions,
                         len(env.observation_space.high),
                         nb_steps_warmup=10,
                         actor_lr=0.001,
                         critic_lr=0.005)
        agent.compile(Adam(lr))
        return agent
    elif agent_type == "ppo":
        pass
    else:
        print("Unsupported model")
        exit(1)
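get_model is not shown, and env and nb_actions are module-level globals in this snippet. A hedged sketch of a compatible builder; the model_type switch and layer widths are hypothetical.

from keras.models import Sequential
from keras.layers import Dense, Flatten

def get_model(model_type):
    # Hypothetical: choose a width per model_type, then build a small MLP over
    # the flattened observation with one linear output per action.
    width = 64 if model_type == "small" else 256
    model = Sequential()
    model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    model.add(Dense(width, activation='relu'))
    model.add(Dense(width, activation='relu'))
    model.add(Dense(nb_actions, activation='linear'))
    return model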
Example #15
def test_sarsa():
    env = TwoRoundDeterministicRewardEnv()
    np.random.seed(123)
    env.seed(123)
    random.seed(123)
    nb_actions = env.action_space.n

    # Next, we build a very simple model.
    model = Sequential()
    model.add(Dense(16, input_shape=(1,)))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions, activation='linear'))

    policy = EpsGreedyQPolicy(eps=.1)
    sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=50, policy=policy)
    sarsa.compile(Adam(lr=1e-3))

    sarsa.fit(env, nb_steps=20000, visualize=False, verbose=0)
    policy.eps = 0.
    h = sarsa.test(env, nb_episodes=20, visualize=False)
    assert_allclose(np.mean(h.history['episode_reward']), 3.)
Example #17
nb_actions = env.action_space.n

# Next, we build a very simple model.
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

# SARSA does not require a memory.
policy = BoltzmannQPolicy()
sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=policy)
sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
sarsa.fit(env, nb_steps=50000, visualize=False, verbose=2)

# After training is done, we save the final weights.
sarsa.save_weights('sarsa_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
sarsa.test(env, nb_episodes=5, visualize=True)
Example #18
env = gym.make('CartPole-v1')
states = env.observation_space.shape[0]
actions = env.action_space.n


def agent(states, actions):
    model = Sequential()
    model.add(Flatten(input_shape=(1, states)))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(actions, activation='linear'))
    return model


model = agent(env.observation_space.shape[0], env.action_space.n)

from rl.agents import SARSAAgent
from rl.policy import EpsGreedyQPolicy

sarsa = SARSAAgent(model=model,
                   policy=EpsGreedyQPolicy(),
                   nb_actions=env.action_space.n)
# sarsa.compile('adam', metrics = ['mse'])
# sarsa.fit(env, nb_steps = 50000, visualize = False, verbose = 1)
# scores = sarsa.test(env, nb_episodes = 100, visualize= True)

# sarsa.save_weights('1-sarsa_weights.h5f', overwrite=True)
sarsa.load_weights('1-sarsa_weights.h5f')
_ = sarsa.test(env, nb_episodes=100, visualize=True)
print('Average score over 100 test games:{}'.format(
    np.mean(_.history['episode_reward'])))
Example #19
class DistopiaSARSA:
    def __init__(self,
                 env_name='distopia-initial4-v0',
                 in_path=None,
                 out_path=None,
                 terminate_on_fail=False,
                 reconstruct=False):
        self.ENV_NAME = env_name
        self.filename = self.ENV_NAME
        self.init_paths(in_path, out_path)
        self.init_env(terminate_on_fail)
        self.init_model(reconstruct)
        self.compile_agent()

    def init_paths(self, in_path, out_path):
        self.in_path = in_path  #if self.in_path != None else './'
        self.out_path = out_path if out_path != None else './'
        self.log_path = "./logs/{}".format(time.time())
        os.mkdir(self.log_path)

    def init_env(self, terminate_on_fail):
        self.env = gym.make(self.ENV_NAME)
        self.env.terminate_on_fail = terminate_on_fail
        self.env.record_path = "{}/ep_".format(self.log_path)
        self.env = gym.wrappers.Monitor(self.env, "recording", force=True)
        np.random.seed(234)
        self.env.seed(234)
        self.nb_actions = np.sum(self.env.action_space.nvec)
        self.num_actions = self.env.NUM_DIRECTIONS
        self.num_blocks = self.env.NUM_DISTRICTS * self.env.BLOCKS_PER_DISTRICT

    def init_model(self, reconstruct=False):
        if self.in_path != None:
            if reconstruct == True:
                self.construct_model()
            else:
                yaml_file = open(
                    "{}/{}.yaml".format(self.in_path, self.filename), 'r')
                model_yaml = yaml_file.read()
                yaml_file.close()
                self.model = model_from_yaml(model_yaml)
            self.model.load_weights("{}/{}.h5".format(self.in_path,
                                                      self.filename))
        else:
            # Next, we build a very simple model.
            self.construct_model()
        self.save_model()
        print(self.model.summary())

    def construct_model(self):
        self.model = Sequential()
        self.model.add(
            Flatten(input_shape=(1, ) + self.env.observation_space.shape))
        self.model.add(Dense(64))
        self.model.add(Activation('relu'))
        self.model.add(Dense(64))
        self.model.add(Activation('relu'))
        # self.model.add(Dense(16))
        # self.model.add(Activation('relu'))
        self.model.add(Dense(self.nb_actions))
        self.model.add(Activation('linear'))

    def save_model(self):
        if self.out_path != None:
            with open(self.filename + ".yaml", 'w+') as yaml_file:
                yaml_file.write(self.model.to_yaml())
            self.model.save_weights('{}/{}.h5'.format(self.out_path,
                                                      self.ENV_NAME))

    def compile_agent(self):
        # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
        # even the metrics!
        processor = DistopiaProcessor(self.num_blocks, self.num_actions)
        #memory = SequentialMemory(limit=50000, window_length=1)
        #policy = PatchedBoltzmannQPolicy(num_actions = self.num_actions, num_blocks = self.num_blocks)
        #test_policy = PatchedGreedyQPolicy(num_actions = self.num_actions, num_blocks = self.num_blocks)
        policy = BoltzmannQPolicy()
        test_policy = GreedyQPolicy()
        self.sarsa = SARSAAgent(model=self.model,
                                processor=processor,
                                nb_actions=self.nb_actions,
                                nb_steps_warmup=1000,
                                policy=policy,
                                test_policy=test_policy,
                                gamma=0.9)
        self.sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

    def train(self, max_steps=100, episodes=100):
        # Okay, now it's time to learn something! We visualize the training here for show, but this
        # slows down training quite a lot. You can always safely abort the training prematurely using
        # Ctrl + C.
        self.env._max_steps = max_steps
        #for i in range(episodes):
        self.env.current_step = 0
        n_steps = max_steps * episodes
        logger = FileLogger(
            filepath='{}/{}.json'.format(self.out_path, self.ENV_NAME))
        self.sarsa.fit(self.env,
                       nb_steps=n_steps,
                       nb_max_episode_steps=max_steps,
                       visualize=False,
                       verbose=1,
                       callbacks=[logger])
        #self.env.reset()

        # After episode is done, we save the final weights.
        self.sarsa.save_weights('{}/{}.h5'.format(self.out_path,
                                                  self.ENV_NAME),
                                overwrite=True)

    def test(self):
        # Finally, evaluate our algorithm for 5 episodes.
        self.sarsa.test(self.env,
                        nb_episodes=5,
                        nb_max_start_steps=0,
                        visualize=True)
Example #20
    def agent(states, actions):
        model = Sequential()
        model.add(Flatten(input_shape=(1, states)))
        model.add(Dense(24, activation="relu"))
        model.add(Dense(24, activation="relu"))
        model.add(Dense(24, activation="relu"))
        model.add(Dense(actions, activation="linear"))
        return model

    model = agent(env.observation_space.shape[0], env.action_space.n)



    policy = EpsGreedyQPolicy()

    sarsa = SARSAAgent(model=model, policy=policy, nb_actions=env.action_space.n)

    sarsa.compile("adam", metrics=["mse"])

    sarsa.fit(env, nb_steps=10000, visualize=False, verbose=1)


    ready()


    scores = sarsa.test(env, nb_episodes=5, visualize=True)

    print('Average score over 5 test games: {}'.format(np.mean(scores.history['episode_reward'])))

    #sarsa.save_weights('sarsa_weights.h5f', overwrite=True) # save trained weights
Example #21
model = Sequential()
model.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

# SARSA does not require a memory.
policy = BoltzmannQPolicy()
sarsa = SARSAAgent(model=model,
                   nb_actions=nb_actions,
                   nb_steps_warmup=10,
                   policy=policy)
sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
sarsa.fit(env, nb_steps=50000, visualize=False, verbose=2)

# After training is done, we save the final weights.
sarsa.save_weights('sarsa_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
sarsa.test(env, nb_episodes=10, visualize=True)
Example #22
# Next, we build a very simple model.
model = Sequential()
model.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

# Build the policy.
policy = BoltzmannQPolicy()
#policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1, value_test=.05,
#                               nb_steps=10000)

if args.use_sarsa:
    # SARSA does not require a memory.
    agent = SARSAAgent(model=model,
                       nb_actions=nb_actions,
                       nb_steps_warmup=10,
                       policy=policy)
else:
    memory = SequentialMemory(limit=50000, window_length=1)
    agent = DQNAgent(model=model,
                     memory=memory,
                     nb_actions=nb_actions,
                     nb_steps_warmup=50,
                     policy=policy)

agent.compile(Adam(lr=args.learning_rate), metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
agent.fit(env, nb_steps=args.n_steps, visualize=False, verbose=2)
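The args object comes from a command-line parser that is not shown. A minimal sketch of a parser exposing the three attributes the snippet reads (use_sarsa, learning_rate, n_steps); the defaults are illustrative.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--use_sarsa', action='store_true',
                    help='train a SARSAAgent instead of a DQNAgent')
parser.add_argument('--learning_rate', type=float, default=1e-3)
parser.add_argument('--n_steps', type=int, default=50000)
args = parser.parse_args()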
Example #23
def train():
    # Get the environment and extract the number of actions.
    env = gym.make(ENV_NAME)
    np.random.seed(123)
    env.seed(123)
    nb_actions = env.action_space.n

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    # Next, we build a very simple model.
    model = Sequential()
    model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))
    print(model.summary())

    # SARSA does not require a memory.
    policy = BoltzmannQPolicy()
    # processor_noisy = CartpoleSurrogateProcessor(e_= ERR_N, e=ERR_P, surrogate=False)
    # processor_surrogate = CartpoleSurrogateProcessor(e_= ERR_N, e=ERR_P, surrogate=True)
    if not SMOOTH:
        processor_noisy = CartpoleProcessor(e_= ERR_N, e=ERR_P, smooth=False, surrogate=False)
        processor_surrogate = CartpoleProcessor(e_= ERR_N, e=ERR_P, smooth=False, surrogate=True)
    else:
        processor_noisy = CartpoleProcessor(e_= ERR_N, e=ERR_P, smooth=True, surrogate=False)
        processor_surrogate = CartpoleProcessor(e_= ERR_N, e=ERR_P, smooth=True, surrogate=True)        

    if REWARD == "normal":
        sarsa_normal = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, 
                                  policy=policy)
        sarsa_normal.compile(Adam(lr=1e-3), metrics=['mae'])
        history_normal = sarsa_normal.fit(env, nb_steps=50000, visualize=False, verbose=2)
        sarsa_normal.save_weights(os.path.join(LOG_DIR, 'sarsa_normal_{}_weights.h5f'.format(ENV_NAME)), overwrite=True)
        sarsa_normal.test(env, nb_episodes=10, visualize=False, verbose=2)

        pandas.DataFrame(history_normal.history).to_csv(os.path.join(LOG_DIR, "normal.csv"))


    elif REWARD == "noisy":
        sarsa_noisy = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, 
                                 policy=policy, processor=processor_noisy)
        sarsa_noisy.compile(Adam(lr=1e-3), metrics=['mae'])
        history_noisy = sarsa_noisy.fit(env, nb_steps=50000, visualize=False, verbose=2)
        if not SMOOTH:
            sarsa_noisy.save_weights(os.path.join(LOG_DIR, 'sarsa_noisy_{}_weights.h5f'.format(ENV_NAME)), overwrite=True)
            pandas.DataFrame(history_noisy.history).to_csv(os.path.join(LOG_DIR, "noisy.csv"))
        else:
            sarsa_noisy.save_weights(os.path.join(LOG_DIR, 'sarsa_noisy_smooth_{}_weights.h5f'.format(ENV_NAME)), overwrite=True)
            pandas.DataFrame(history_noisy.history).to_csv(os.path.join(LOG_DIR, "noisy_smooth.csv"))

        sarsa_noisy.test(env, nb_episodes=10, visualize=False)


    elif REWARD == "surrogate":
        sarsa_surrogate = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, 
                                     policy=policy, processor=processor_surrogate)
        sarsa_surrogate.compile(Adam(lr=1e-3), metrics=['mae'])
        history_surrogate = sarsa_surrogate.fit(env, nb_steps=50000, visualize=False, verbose=2)
        if not SMOOTH:
            sarsa_surrogate.save_weights(os.path.join(LOG_DIR, 'sarsa_surrogate_{}_weights.h5f'.format(ENV_NAME)), overwrite=True)
            pandas.DataFrame(history_surrogate.history).to_csv(os.path.join(LOG_DIR, "surrogate.csv"))

        else:
            sarsa_surrogate.save_weights(os.path.join(LOG_DIR, 'sarsa_surrogate_smooth_{}_weights.h5f'.format(ENV_NAME)), overwrite=True)
            pandas.DataFrame(history_surrogate.history).to_csv(os.path.join(LOG_DIR, "surrogate_smooth.csv"))

        sarsa_surrogate.test(env, nb_episodes=10, visualize=False)
Example #24
    def __init__(
            self,
            env="CartPole-v1",
            emulateOculus=True,
            visualize=True,
            teachingFilesPath=None,
            policyValues={
                "inner_policy": EpsGreedyQPolicy(),
                "attr": "eps",
                "value_max": 0.75,
                "value_min": .01,
                "value_test": .0,
                "nb_steps": 50000
            },
            dobotEmulation=False):
        self.policyValues = policyValues
        os.environ[
            "PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'
        physical_devices = tf.config.experimental.list_physical_devices('GPU')
        print("physical_devices-------------", len(physical_devices))
        tf.config.experimental.set_memory_growth(physical_devices[0], True)
        self.episodeLength = 25
        if env == "CartPole-v1":
            self.env = gym.make('CartPole-v1')
            self.states = self.env.observation_space.shape[0]
            self.actions = self.env.action_space.n
            self.saveFileName = 'sarsa_weights.h5f'
            logdir = "logs/CartPoleV1/" + datetime.now().strftime(
                "%Y%m%d-%H%M%S")
            self.tensorboard_callback = keras.callbacks.TensorBoard(
                log_dir=logdir)
            self.visualize = True
        elif env == "Dobot":
            self.env = dobotGym.dobotGym(emulateOculus=emulateOculus,
                                         episodeLength=self.episodeLength,
                                         visualize=visualize,
                                         teachingFilesPath=teachingFilesPath,
                                         dobotEmulation=dobotEmulation)
            self.states = self.env.observation_space.shape[0]
            self.actions = self.env.action_space.shape[0]
            self.saveFileName = 'sarsa_weights_dobot.h5f'
            logdir = "logs/Dobot/" + datetime.now().strftime("%Y%m%d-%H%M%S")
            self.tensorboard_callback = keras.callbacks.TensorBoard(
                log_dir=logdir)
            self.visualize = True
        else:
            raise TypeError("Wrong env")

        print(
            'States', self.states
        )  # To get an idea about the number of variables affecting the environment
        print(
            'Actions', self.actions
        )  # To get an idea about the number of possible actions in the environment, do [right,left]

        #

        # episodes = 10
        # for episode in range(1, episodes + 1):
        #     # At each beginning reset the game
        #     state = self.env.reset()
        #     # set done to False
        #     done = False
        #     # set score to 0
        #     score = 0
        #     # while the game is not finished
        #     while not done:
        #         # visualize each step
        #         self.env.render()
        #         # choose a random action
        #         action = random.choice([0, 1])
        #         # execute the action
        #         n_state, reward, done, info = self.env.step(action)
        #         # keep track of rewards
        #         score += reward
        #     print('episode {} score {}'.format(episode, score))

        # not working :(
        # self.agent = self.agentDDP(self.states, self.actions)
        # self.agent = self.NAFAgent(self.states, self.actions)

        # self.policy = EpsGreedyQPolicy()

        self.savingFreq = 100
        self.actualSaving = 0

        self.model = self.agentSarsa(self.states, self.actions)
        self.policy = LinearAnnealedPolicy(
            inner_policy=self.policyValues["inner_policy"],
            attr=self.policyValues["attr"],
            value_max=self.policyValues["value_max"],
            value_min=self.policyValues["value_min"],
            value_test=self.policyValues["value_test"],
            nb_steps=self.policyValues["nb_steps"])
        self.agent = SARSAAgent(model=self.model,
                                policy=self.policy,
                                nb_actions=self.actions)

        self.agent._is_graph_network = True

        def t():
            return False

        self.agent._in_multi_worker_mode = t

        self.agent.save = self.saveAgentWeights

        def lenmeh():
            return self.actions
Example #25
# Next, we build a very simple model.
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(8))
model.add(Activation('relu'))
model.add(Dense(8))
model.add(Activation('relu'))
model.add(Dense(8))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

# SARSA does not require a memory.
policy = BoltzmannQPolicy()
sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=policy)
sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

model_fn = 'sarsa_{}_weights.h5f'.format(ENV_NAME)
if os.path.isfile(model_fn):
    sarsa.load_weights(model_fn)
# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
#sarsa.fit(env, nb_steps=50000,nb_max_episode_steps=500, visualize=False, verbose=2)

# After training is done, we save the final weights.
#sarsa.save_weights('sarsa_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
sarsa.test(env, nb_episodes=5, visualize=True)
Example #26
def agent(states, actions):
    model = Sequential()
    model.add(Flatten(input_shape=(1, states)))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(actions, activation='linear'))
    return model


model = agent(env.observation_space.shape[0], env.action_space.n)

from rl.agents import SARSAAgent
from rl.policy import EpsGreedyQPolicy
policy = EpsGreedyQPolicy()
sarsa = SARSAAgent(model=model, policy=policy, nb_actions=env.action_space.n)
sarsa.compile('adam', metrics=['mse'])

sarsa.fit(env, nb_steps=50000, visualize=False, verbose=1)

scores = sarsa.test(env, nb_episodes=100, visualize=False)
print('Average score over 100 test games:{}'.format(
    np.mean(scores.history['episode_reward'])))

sarsa.save_weights('sarsa_weights.h5f', overwrite=True)

_ = sarsa.test(env, nb_episodes=2, visualize=True)

env.close()
Example #27
from rl.agents import SARSAAgent
from rl.policy import EpsGreedyQPolicy


ENV_NAME = 'CartPole-v0'

env = gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n


model = Sequential()

model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(24,activation='relu'))
model.add(Dense(24, activation='relu'))
model.add(Dense(nb_actions, activation='linear'))


policy = EpsGreedyQPolicy()
sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=policy)
sarsa.compile(Adam(lr=1e-2), metrics=['mae'])


sarsa.fit(env, nb_steps=20000, visualize=False, verbose=2)

sarsa.test(env, nb_episodes=5, visualize=True)

Example #28
# agent network
model = Sequential()
model.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
model.add(Dense(32 * scale))
model.add(Activation('relu'))
model.add(Dense(16 * scale))
model.add(Activation('relu'))
model.add(Dense(8 * scale))
model.add(Activation('relu'))
model.add(Dense(nb_actions, activation='softmax'))
print(model.summary())

# specifications for the RL agent
policy = EpsGreedyQPolicy()
sarsa = SARSAAgent(model=model,
                   nb_actions=nb_actions,
                   nb_steps_warmup=1000,
                   policy=policy)
sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

# compiling the model
sarsa.compile(Adam(lr=lrn_rate), metrics=['mae'])

# setting up callbacks for result collection and realtime visualization of the results through tensorboard
tensorboard = TensorBoard(log_dir="logs/{}".format(time()))
tpl = TrainEpisodeLogger()

# finally perform the training; visualize=False skips rendering the game, which speeds up training
sarsa.fit(env,
          nb_steps=nb_steps,
          visualize=False,
          verbose=2,
          callbacks=[tensorboard, tpl])
Example #29
# Make a neural net with 3 hidden layers
def agent(states, actions):
    model = Sequential()
    model.add(Flatten(input_shape=(1, states)))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(actions, activation='linear'))
    return model


# Actually make a neural net with 3 hidden layers
model = agent(env.observation_space.shape[0], env.action_space.n)

policy = EpsGreedyQPolicy()
# Create a tensorflow reinforcement learning agent using the [state > action > reward] system
sarsa = SARSAAgent(model=model, policy=policy, nb_actions=env.action_space.n)
# Choose how we calculate reward and modify the model
sarsa.compile('adam', metrics=['mse'])

# sarsa.fit(env, nb_steps = 50000, visualize = False, verbose = 1)
sarsa.load_weights('cartpolekerassarsa.h5f')

scores = sarsa.test(env, nb_episodes=10, visualize=False)
print('Average score over 10 test games: {}'.format(
    np.mean(scores.history['episode_reward'])))

sarsa.save_weights('cartpolekerassarsa.h5f', overwrite=True)
sarsa.test(env, nb_episodes=2, visualize=True)
Example #30
class DQN:
    def __init__(
            self,
            env="CartPole-v1",
            emulateOculus=True,
            visualize=True,
            teachingFilesPath=None,
            policyValues={
                "inner_policy": EpsGreedyQPolicy(),
                "attr": "eps",
                "value_max": 0.75,
                "value_min": .01,
                "value_test": .0,
                "nb_steps": 50000
            },
            dobotEmulation=False):
        self.policyValues = policyValues
        os.environ[
            "PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'
        physical_devices = tf.config.experimental.list_physical_devices('GPU')
        print("physical_devices-------------", len(physical_devices))
        tf.config.experimental.set_memory_growth(physical_devices[0], True)
        self.episodeLength = 25
        if env == "CartPole-v1":
            self.env = gym.make('CartPole-v1')
            self.states = self.env.observation_space.shape[0]
            self.actions = self.env.action_space.n
            self.saveFileName = 'sarsa_weights.h5f'
            logdir = "logs/CartPoleV1/" + datetime.now().strftime(
                "%Y%m%d-%H%M%S")
            self.tensorboard_callback = keras.callbacks.TensorBoard(
                log_dir=logdir)
            self.visualize = True
        elif env == "Dobot":
            self.env = dobotGym.dobotGym(emulateOculus=emulateOculus,
                                         episodeLength=self.episodeLength,
                                         visualize=visualize,
                                         teachingFilesPath=teachingFilesPath,
                                         dobotEmulation=dobotEmulation)
            self.states = self.env.observation_space.shape[0]
            self.actions = self.env.action_space.shape[0]
            self.saveFileName = 'sarsa_weights_dobot.h5f'
            logdir = "logs/Dobot/" + datetime.now().strftime("%Y%m%d-%H%M%S")
            self.tensorboard_callback = keras.callbacks.TensorBoard(
                log_dir=logdir)
            self.visualize = True
        else:
            raise TypeError("Wrong env")

        print(
            'States', self.states
        )  # To get an idea about the number of variables affecting the environment
        print(
            'Actions', self.actions
        )  # To get an idea about the number of possible actions in the environment, do [right,left]

        #

        # episodes = 10
        # for episode in range(1, episodes + 1):
        #     # At each beginning reset the game
        #     state = self.env.reset()
        #     # set done to False
        #     done = False
        #     # set score to 0
        #     score = 0
        #     # while the game is not finished
        #     while not done:
        #         # visualize each step
        #         self.env.render()
        #         # choose a random action
        #         action = random.choice([0, 1])
        #         # execute the action
        #         n_state, reward, done, info = self.env.step(action)
        #         # keep track of rewards
        #         score += reward
        #     print('episode {} score {}'.format(episode, score))

        # not working :(
        # self.agent = self.agentDDP(self.states, self.actions)
        # self.agent = self.NAFAgent(self.states, self.actions)

        # self.policy = EpsGreedyQPolicy()

        self.savingFreq = 100
        self.actualSaving = 0

        self.model = self.agentSarsa(self.states, self.actions)
        self.policy = LinearAnnealedPolicy(
            inner_policy=self.policyValues["inner_policy"],
            attr=self.policyValues["attr"],
            value_max=self.policyValues["value_max"],
            value_min=self.policyValues["value_min"],
            value_test=self.policyValues["value_test"],
            nb_steps=self.policyValues["nb_steps"])
        self.agent = SARSAAgent(model=self.model,
                                policy=self.policy,
                                nb_actions=self.actions)

        self.agent._is_graph_network = True

        def t():
            return False

        self.agent._in_multi_worker_mode = t

        self.agent.save = self.saveAgentWeights

        def lenmeh():
            return self.actions

        # self.agent.__len__ = lenmeh

    def saveAgentWeights(self, path, overwrite=True):
        if self.actualSaving < self.savingFreq:
            self.actualSaving += 1
            return None
        else:
            self.actualSaving = 0
        path = 'model/checkpoint/' + datetime.now().strftime(
            "%Y%m%d-%H%M%S") + self.saveFileName
        self.agent.save_weights(path, overwrite)

    def agentSarsa(self, states, actions):
        self.model = Sequential()
        self.model.add(LSTM(42, activation='sigmoid', input_shape=(1, states)))
        self.model.add(Dense(42, activation='sigmoid'))
        self.model.add(Dense(42, activation='sigmoid'))
        self.model.add(Dense(24, activation='sigmoid'))
        self.model.add(Dense(12, activation='sigmoid'))
        self.model.add(Dense(actions, activation='linear'))
        self.path = fileOperation.saveToFolder(self.model.to_json(),
                                               name='modelShape',
                                               folder="model\\checkpoint")

        # stateful=False: states are reset together after each batch.
        # model.add(Flatten(input_shape=(1, states)))
        # dot_img_file = '/model_1.png'
        # keras.utils.plot_model(self.model, to_file=dot_img_file, show_shapes=True)
        # model.reset_states()
        return self.model

    def load(self):
        path = fileOperation.openDialogFunction(".h5f")
        self.agent.compile('adam', metrics=['mse'])
        self.agent.load_weights(path)
        self.agent.compile('adam', metrics=['mse'])

    def test(self, nb_episodes=2):
        _ = self.agent.test(self.env,
                            nb_episodes=nb_episodes,
                            visualize=self.visualize)

    def fit(self, visualize=False):
        checkpoint_filepath = 'model/checkpoint/'
        model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
            filepath=checkpoint_filepath,
            save_weights_only=False,
            save_freq=25)
        self.agent.compile('adam', metrics=['mse'])
        self.agent.fit(
            self.env,
            nb_steps=self.policyValues["nb_steps"],
            log_interval=self.episodeLength,
            visualize=visualize,
            verbose=1,
            nb_max_start_steps=1,
            start_step_policy=self.model.reset_states,

            # callbacks=[PlotLossesKeras()])
            callbacks=[self.tensorboard_callback, model_checkpoint_callback],
        )

        scores = self.agent.test(self.env, nb_episodes=5, visualize=visualize)
        print('Average score over 5 test games:{}'.format(
            np.mean(scores.history['episode_reward'])))
Example #31
if nHiddenLayers > 1:
    for i in range(1, nHiddenLayers):
        hiddenLayer = Dense(nHiddenLayerNodes,
                            activation='relu',
                            kernel_initializer=weight_initializer)(hiddenLayer)

outputLayer = Dense(nb_actions, activation='linear')(hiddenLayer)

model = Model(inputLayer, outputLayer)
print(model.summary())

# SARSA does not require a memory.
policy = BoltzmannQPolicy()
sarsa = SARSAAgent(model=model,
                   nb_actions=nb_actions,
                   nb_steps_warmup=10,
                   policy=policy)
sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

if loadFromExisting:
    sarsa.load_weights(file_path)
else:
    startTime = time.time()
    sarsa.fit(env, nb_steps=nSteps, visualize=True, verbose=1)
    endTime = time.time()
    sarsa.save_weights(file_path, overwrite=True)

# After training is done, we save the final weights.

# Finally, evaluate our algorithm for 5 episodes.
sarsa.test(env, nb_episodes=5, visualize=True)
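This snippet begins after its network preamble, so inputLayer, hiddenLayer, weight_initializer, nHiddenLayers, and nHiddenLayerNodes arrive undefined. A hedged reconstruction of what that preamble might look like with the Keras functional API; the initializer and sizes are illustrative guesses.

from keras.models import Model
from keras.layers import Input, Dense, Flatten
from keras.initializers import VarianceScaling

nHiddenLayers = 2
nHiddenLayerNodes = 16
weight_initializer = VarianceScaling()              # illustrative choice

inputLayer = Input(shape=(1,) + env.observation_space.shape)
flattened = Flatten()(inputLayer)
hiddenLayer = Dense(nHiddenLayerNodes,
                    activation='relu',
                    kernel_initializer=weight_initializer)(flattened)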
Example #32
def agent(states, actions):
    model = Sequential()
    model.add(Flatten(input_shape=(1, states)))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(actions))
    model.add(Activation('linear'))
    return model


model = agent(states, actions)

# Define the policy
policy = EpsGreedyQPolicy()
# Define SARSA agent by feeding it the policy and the model
sarsa = SARSAAgent(model=model,
                   nb_actions=actions,
                   nb_steps_warmup=10,
                   policy=policy)
# compile sarsa with mean squared error loss
sarsa.compile('adam', metrics=['mse'])
# train the agent for 50000 steps
sarsa.fit(env, nb_steps=50000, visualize=False, verbose=1)

# Evaluate the agent on 100 new episodes.
scores = sarsa.test(env, nb_episodes=100, visualize=False)
print('Average score over 100 test games: {}'.format(
    np.mean(scores.history['episode_reward'])))