Ejemplo n.º 1
0
def run_sarsa():
    """Train a SARSA agent on SnakeGymDiscrete, save its weights, and run
    a short rendered evaluation.

    Reads the module-level constant ``N_NODE_NETWORK`` for the hidden-layer
    width (the original declared it ``global`` needlessly — it is only read,
    never assigned, so the declaration was removed).
    """
    env = SnakeGymDiscrete()
    nb_actions = env.action_space.n

    # Fix the RNG seeds so training runs are reproducible.
    np.random.seed(123)
    env.seed(123)

    # Q-network: three ReLU hidden layers of N_NODE_NETWORK units over the
    # flattened (window=1) observation, linear output with one unit/action.
    model = Sequential()
    model.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
    for _ in range(3):
        model.add(Dense(N_NODE_NETWORK))
        model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))

    # SARSA is on-policy and does not require a replay memory.
    policy = BoltzmannQPolicy()
    sarsa = SARSAAgent(model=model,
                       nb_actions=nb_actions,
                       nb_steps_warmup=10,
                       policy=policy)
    sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

    sarsa.fit(env, nb_steps=50000, visualize=False, verbose=2)
    sarsa.save_weights('sarsa_SnakeGymDiscrete_weights.h5f', overwrite=True)

    sarsa.test(env, nb_episodes=5, visualize=True)
Ejemplo n.º 2
0
def main():
    """Train a SARSA agent on BinanceEnv, save its weights, evaluate it,
    then replay the greedy policy for 2000 steps.

    NOTE(review): large blocks of commented-out exploratory code were
    removed; see version history if the manual-trading snippets are needed.
    """
    env = BinanceEnv()

    model = agent(env.observation_space.shape[0], env.action_space.n)
    policy = EpsGreedyQPolicy()
    sarsa = SARSAAgent(model=model, policy=policy, nb_actions=env.action_space.n)
    sarsa.compile('adam', metrics=['mse', 'accuracy'])
    # To resume a previous run instead of training from scratch:
    # sarsa.load_weights('sarsa_weights_bnb_07.h5f')
    env.is_testing = False
    sarsa.fit(env, nb_steps=100000, visualize=False, verbose=1)
    sarsa.save_weights('sarsa_weights_bnb_07_1.h5f', overwrite=True)
    env.is_testing = True
    scores = sarsa.test(env, nb_episodes=1, visualize=False)
    # Fixed: the original message claimed 100 games but only 1 episode runs.
    print('Average score over 1 test game:{}'.format(np.mean(scores.history['episode_reward'])))

    _ = sarsa.test(env, nb_episodes=10, visualize=True)
    obs = env.reset()
    for _ in range(2000):
        # Greedy action from the Q-network. The original unpacked
        # model.predict(obs) into (action, _states) — stable-baselines
        # style — which fails for a Keras model returning one Q-value
        # array. Reshape to (batch=1, window=1, features) as the model's
        # Flatten(input_shape=(1, states)) layer expects.
        q_values = model.predict(np.asarray(obs)[np.newaxis, np.newaxis])
        action = int(np.argmax(q_values))
        obs, rewards, done, info = env.step(action)
        env.render()
Ejemplo n.º 3
0
def run_sarsa_agent(driver, queries, candidate_indices, tuning_config):
    """Build the UDO tuning environment, train a SARSA agent on it for a
    short run, evaluate it, and print the best state found."""
    env = gym.make("udo_optimization-v0", driver=driver, queries=queries, candidate_indices=candidate_indices,
                   config=tuning_config)
    env.horizon = tuning_config['horizon']

    nb_actions = env.action_space.n
    logging.info(f"nr action: {nb_actions}")
    logging.info(f"observation space: {env.observation_space.shape}")

    # Q-network: four ReLU hidden layers of increasing-then-decreasing
    # width, fed by the flattened (window=1) observation, with a linear
    # output producing one Q-value per action.
    model = Sequential()
    model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    for width in (52, 252, 526, 252):
        model.add(Dense(width))
        model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))

    logging.info(model.summary())

    # SARSA is on-policy, so no replay memory is needed.
    policy = BoltzmannQPolicy()
    sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=policy)
    sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

    # Train briefly; rendering is off because it slows training a lot.
    sarsa.fit(env, nb_steps=500, visualize=False, verbose=2)

    # Evaluate for five episodes and report the best configuration seen.
    sarsa.test(env, nb_episodes=5, visualize=False)
    env.print_state_summary(env.best_state)
Ejemplo n.º 4
0
def test_sarsa():
    """Smoke-test SARSA on the deterministic two-round reward env: after
    training, greedy play must reach the optimal mean reward of 3."""
    env = TwoRoundDeterministicRewardEnv()
    # Seed every RNG involved so the final assertion is deterministic.
    np.random.seed(123)
    env.seed(123)
    random.seed(123)
    nb_actions = env.action_space.n

    # Minimal Q-network: one ReLU hidden layer over the scalar state.
    model = Sequential()
    model.add(Dense(16, input_shape=(1,)))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions, activation='linear'))

    eps_policy = EpsGreedyQPolicy(eps=.1)
    agent_ = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=50, policy=eps_policy)
    agent_.compile(Adam(lr=1e-3))

    agent_.fit(env, nb_steps=20000, visualize=False, verbose=0)
    # Disable exploration and verify the greedy policy is optimal.
    eps_policy.eps = 0.
    history = agent_.test(env, nb_episodes=20, visualize=False)
    assert_allclose(np.mean(history.history['episode_reward']), 3.)
Ejemplo n.º 5
0
def test_sarsa():
    """Integration check: a SARSA agent trained on
    TwoRoundDeterministicRewardEnv must achieve the optimal mean episode
    reward (3.0) once exploration is switched off."""
    env = TwoRoundDeterministicRewardEnv()
    # Deterministic seeds for numpy, the env, and the stdlib RNG.
    for seed_fn in (np.random.seed, env.seed, random.seed):
        seed_fn(123)
    nb_actions = env.action_space.n

    # Tiny two-layer Q-network over the 1-dimensional state.
    model = Sequential()
    model.add(Dense(16, input_shape=(1,)))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions, activation='linear'))

    policy = EpsGreedyQPolicy(eps=.1)
    sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=50, policy=policy)
    sarsa.compile(Adam(lr=1e-3))

    sarsa.fit(env, nb_steps=20000, visualize=False, verbose=0)
    policy.eps = 0.  # greedy evaluation
    h = sarsa.test(env, nb_episodes=20, visualize=False)
    assert_allclose(np.mean(h.history['episode_reward']), 3.)
Ejemplo n.º 6
0
class DistopiaSARSA:
    """SARSA agent wrapper for the Distopia redistricting environment.

    Handles path setup, environment creation, model construction or
    deserialization, agent compilation, training with file logging, and
    evaluation.
    """

    def __init__(self,
                 env_name='distopia-initial4-v0',
                 in_path=None,
                 out_path=None,
                 terminate_on_fail=False,
                 reconstruct=False):
        self.ENV_NAME = env_name
        self.filename = self.ENV_NAME
        self.init_paths(in_path, out_path)
        self.init_env(terminate_on_fail)
        self.init_model(reconstruct)
        self.compile_agent()

    def init_paths(self, in_path, out_path):
        """Record input/output paths and create a timestamped log dir."""
        self.in_path = in_path
        self.out_path = out_path if out_path is not None else './'
        self.log_path = "./logs/{}".format(time.time())
        os.mkdir(self.log_path)

    def init_env(self, terminate_on_fail):
        """Create, monitor-wrap, and seed the gym environment."""
        self.env = gym.make(self.ENV_NAME)
        self.env.terminate_on_fail = terminate_on_fail
        self.env.record_path = "{}/ep_".format(self.log_path)
        self.env = gym.wrappers.Monitor(self.env, "recording", force=True)
        np.random.seed(234)
        self.env.seed(234)
        # Flatten the MultiDiscrete action space into one discrete range.
        self.nb_actions = np.sum(self.env.action_space.nvec)
        self.num_actions = self.env.NUM_DIRECTIONS
        self.num_blocks = self.env.NUM_DISTRICTS * self.env.BLOCKS_PER_DISTRICT

    def init_model(self, reconstruct=False):
        """Load a saved model (optionally rebuilding its architecture from
        code instead of YAML) or construct a fresh one, then persist the
        current architecture and print a summary."""
        if self.in_path is not None:
            if reconstruct:
                self.construct_model()
            else:
                # Use a context manager so the YAML file is always closed.
                with open("{}/{}.yaml".format(self.in_path, self.filename),
                          'r') as yaml_file:
                    model_yaml = yaml_file.read()
                self.model = model_from_yaml(model_yaml)
            self.model.load_weights("{}/{}.h5".format(self.in_path,
                                                      self.filename))
        else:
            self.construct_model()
        self.save_model()
        print(self.model.summary())

    def construct_model(self):
        """Build the Q-network: two ReLU hidden layers of 64 units over the
        flattened observation, linear output with one unit per action."""
        self.model = Sequential()
        self.model.add(
            Flatten(input_shape=(1, ) + self.env.observation_space.shape))
        self.model.add(Dense(64))
        self.model.add(Activation('relu'))
        self.model.add(Dense(64))
        self.model.add(Activation('relu'))
        self.model.add(Dense(self.nb_actions))
        self.model.add(Activation('linear'))

    def save_model(self):
        """Persist the architecture (YAML) and the weights (HDF5).

        NOTE(review): the YAML is written to the current directory while
        the weights go to ``out_path`` — looks unintended, but kept as-is
        to preserve behavior.
        """
        if self.out_path is not None:
            with open(self.filename + ".yaml", 'w+') as yaml_file:
                yaml_file.write(self.model.to_yaml())
            self.model.save_weights('{}/{}.h5'.format(self.out_path,
                                                      self.ENV_NAME))

    def compile_agent(self):
        """Configure and compile the SARSA agent with Boltzmann exploration
        during training and greedy action selection at test time."""
        processor = DistopiaProcessor(self.num_blocks, self.num_actions)
        policy = BoltzmannQPolicy()
        test_policy = GreedyQPolicy()
        self.sarsa = SARSAAgent(model=self.model,
                                processor=processor,
                                nb_actions=self.nb_actions,
                                nb_steps_warmup=1000,
                                policy=policy,
                                test_policy=test_policy,
                                gamma=0.9)
        self.sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

    def train(self, max_steps=100, episodes=100):
        """Train for ``episodes`` episodes of ``max_steps`` steps each,
        logging history to JSON and saving the final weights."""
        self.env._max_steps = max_steps
        self.env.current_step = 0
        n_steps = max_steps * episodes
        logger = FileLogger(
            filepath='{}/{}.json'.format(self.out_path, self.ENV_NAME))
        self.sarsa.fit(self.env,
                       nb_steps=n_steps,
                       nb_max_episode_steps=max_steps,
                       visualize=False,
                       verbose=1,
                       callbacks=[logger])

        # After training, persist the final weights.
        self.sarsa.save_weights('{}/{}.h5'.format(self.out_path,
                                                  self.ENV_NAME),
                                overwrite=True)

    def test(self):
        """Evaluate the trained agent for 5 rendered episodes."""
        self.sarsa.test(self.env,
                        nb_episodes=5,
                        nb_max_start_steps=0,
                        visualize=True)
Ejemplo n.º 7
0
# Q-network for the SARSA agent: three Dense(16)+ReLU hidden layers over
# the flattened (window=1) observation, one linear output per action.
# NOTE(review): `env`, `nb_actions`, and `ENV_NAME` are defined earlier
# in the file, outside this excerpt.
model = Sequential()
model.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

# SARSA does not require a memory.
policy = BoltzmannQPolicy()
sarsa = SARSAAgent(model=model,
                   nb_actions=nb_actions,
                   nb_steps_warmup=10,
                   policy=policy)
sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
sarsa.fit(env, nb_steps=5000, visualize=False, verbose=2)

# After training is done, we save the final weights.
sarsa.save_weights('sarsa_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
sarsa.test(env, nb_episodes=5, visualize=False)
Ejemplo n.º 8
0
def train():
    """Train a SARSA agent on ENV_NAME under one of three reward regimes
    (normal / noisy / surrogate, selected by the module-level REWARD flag),
    saving weights and a CSV of the training history into LOG_DIR.

    Relies on module-level names defined elsewhere in the file: ENV_NAME,
    REWARD, SMOOTH, ERR_N, ERR_P, LOG_DIR, and CartpoleProcessor.
    """
    # Get the environment and extract the number of actions.
    env = gym.make(ENV_NAME)
    np.random.seed(123)
    env.seed(123)
    nb_actions = env.action_space.n

    # Let TensorFlow grow GPU memory on demand instead of pre-allocating.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    # Next, we build a very simple model.
    model = Sequential()
    model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))
    print(model.summary())

    # SARSA does not require a memory.
    policy = BoltzmannQPolicy()
    # Processors inject reward noise (optionally smoothed) and, for the
    # "surrogate" variant, a corrected surrogate reward.
    # processor_noisy = CartpoleSurrogateProcessor(e_= ERR_N, e=ERR_P, surrogate=False)
    # processor_surrogate = CartpoleSurrogateProcessor(e_= ERR_N, e=ERR_P, surrogate=True)
    if not SMOOTH:
        processor_noisy = CartpoleProcessor(e_= ERR_N, e=ERR_P, smooth=False, surrogate=False)
        processor_surrogate = CartpoleProcessor(e_= ERR_N, e=ERR_P, smooth=False, surrogate=True)
    else:
        processor_noisy = CartpoleProcessor(e_= ERR_N, e=ERR_P, smooth=True, surrogate=False)
        processor_surrogate = CartpoleProcessor(e_= ERR_N, e=ERR_P, smooth=True, surrogate=True)

    # Unmodified rewards: train, save weights, test, and dump history.
    if REWARD == "normal":
        sarsa_normal = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, 
                                  policy=policy)
        sarsa_normal.compile(Adam(lr=1e-3), metrics=['mae'])
        history_normal = sarsa_normal.fit(env, nb_steps=50000, visualize=False, verbose=2)
        sarsa_normal.save_weights(os.path.join(LOG_DIR, 'sarsa_normal_{}_weights.h5f'.format(ENV_NAME)), overwrite=True)
        sarsa_normal.test(env, nb_episodes=10, visualize=False, verbose=2)

        pandas.DataFrame(history_normal.history).to_csv(os.path.join(LOG_DIR, "normal.csv"))

    # Noisy rewards: output filenames depend on the SMOOTH flag.
    elif REWARD == "noisy":
        sarsa_noisy = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, 
                                 policy=policy, processor=processor_noisy)
        sarsa_noisy.compile(Adam(lr=1e-3), metrics=['mae'])
        history_noisy = sarsa_noisy.fit(env, nb_steps=50000, visualize=False, verbose=2)
        if not SMOOTH:
            sarsa_noisy.save_weights(os.path.join(LOG_DIR, 'sarsa_noisy_{}_weights.h5f'.format(ENV_NAME)), overwrite=True)
            pandas.DataFrame(history_noisy.history).to_csv(os.path.join(LOG_DIR, "noisy.csv"))
        else:
            sarsa_noisy.save_weights(os.path.join(LOG_DIR, 'sarsa_noisy_smooth_{}_weights.h5f'.format(ENV_NAME)), overwrite=True)
            pandas.DataFrame(history_noisy.history).to_csv(os.path.join(LOG_DIR, "noisy_smooth.csv"))

        sarsa_noisy.test(env, nb_episodes=10, visualize=False)

    # Surrogate (corrected) rewards: same structure as the noisy branch.
    elif REWARD == "surrogate":
        sarsa_surrogate = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, 
                                     policy=policy, processor=processor_surrogate)
        sarsa_surrogate.compile(Adam(lr=1e-3), metrics=['mae'])
        history_surrogate = sarsa_surrogate.fit(env, nb_steps=50000, visualize=False, verbose=2)
        if not SMOOTH:
            sarsa_surrogate.save_weights(os.path.join(LOG_DIR, 'sarsa_surrogate_{}_weights.h5f'.format(ENV_NAME)), overwrite=True)
            pandas.DataFrame(history_surrogate.history).to_csv(os.path.join(LOG_DIR, "surrogate.csv"))

        else:
            sarsa_surrogate.save_weights(os.path.join(LOG_DIR, 'sarsa_surrogate_smooth_{}_weights.h5f'.format(ENV_NAME)), overwrite=True)
            pandas.DataFrame(history_surrogate.history).to_csv(os.path.join(LOG_DIR, "surrogate_smooth.csv"))

        sarsa_surrogate.test(env, nb_episodes=10, visualize=False)
Ejemplo n.º 9
0
# Where this app's Keras model is cached on disk.
model_folder = './models/' + app_name + '/'
model_file = model_folder + app_name + '.h5'

try:
    # Load the model if it already exists.
    print('Loading existing model...')
    model = load_model(model_file)
    print('Model loaded.')
except OSError:
    # Build it from scratch if it doesn't.
    # Three Dense(64)+ReLU hidden layers, linear readout per action.
    model = Sequential()
    model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    model.add(Dense(64, activation='relu', use_bias=True, name='dense1'))
    model.add(Dense(64, activation='relu', use_bias=True, name='dense2'))
    model.add(Dense(64, activation='relu', use_bias=True, name='dense3'))
    model.add(Dense(nb_actions, activation='linear', name='readout'))
    print(model.summary())

policy = BoltzmannQPolicy()
sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=policy)
sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

# Train and persist the model only when the `fit` flag (defined elsewhere
# in the file) is set.
if fit:
    sarsa.fit(env, nb_steps=50000, visualize=visualize_fit, verbose=2)
    pathlib.Path(model_folder).mkdir(parents=True, exist_ok=True)
    model.save(model_file)

# Evaluate only when the `test` flag is set.
if test:
    sarsa.test(env, nb_episodes=5, visualize=visualize_test)
# Make a neural net with 3 hidden layers
def agent(states, actions):
    """Return an uncompiled Q-network: the (1, states) observation is
    flattened, passed through three Dense(24, relu) hidden layers, and
    mapped to one linear output per action."""
    net = Sequential()
    net.add(Flatten(input_shape=(1, states)))
    for _ in range(3):
        net.add(Dense(24, activation='relu'))
    net.add(Dense(actions, activation='linear'))
    return net


# Actually make a neural net with 3 hidden layers
model = agent(env.observation_space.shape[0], env.action_space.n)

policy = EpsGreedyQPolicy()
# Create a tensorflow reinforcement learning agent using the [state > action > reward] system
sarsa = SARSAAgent(model=model, policy=policy, nb_actions=env.action_space.n)
# Choose how we calculate reward and modify the model
sarsa.compile('adam', metrics=['mse'])

# Weights are loaded from a previous run rather than retrained here.
# sarsa.fit(env, nb_steps = 50000, visualize = False, verbose = 1)
sarsa.load_weights('cartpolekerassarsa.h5f')

# Evaluate and report the mean episode reward over 10 games.
scores = sarsa.test(env, nb_episodes=10, visualize=False)
print('Average score over 10 test games: {}'.format(
    np.mean(scores.history['episode_reward'])))

# Re-save the (unchanged) weights, then replay 2 rendered episodes.
sarsa.save_weights('cartpolekerassarsa.h5f', overwrite=True)
sarsa.test(env, nb_episodes=2, visualize=True)
Ejemplo n.º 11
0
class DQN:
    """SARSA agent wrapper (the class name is historical) for either the
    CartPole-v1 gym environment or a Dobot robot-arm environment.

    Sets up TensorBoard logging, throttled weight checkpointing, and a
    linearly-annealed exploration policy around a keras-rl SARSAAgent.
    """

    def __init__(
            self,
            env="CartPole-v1",
            emulateOculus=True,
            visualize=True,
            teachingFilesPath=None,
            policyValues=None,
            dobotEmulation=False):
        # Build the default policy configuration per call rather than as a
        # mutable default argument shared across all instances.
        if policyValues is None:
            policyValues = {
                "inner_policy": EpsGreedyQPolicy(),
                "attr": "eps",
                "value_max": 0.75,
                "value_min": .01,
                "value_test": .0,
                "nb_steps": 50000
            }
        self.policyValues = policyValues
        # Make Graphviz (used by keras.utils.plot_model) reachable on Windows.
        os.environ[
            "PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'
        physical_devices = tf.config.experimental.list_physical_devices('GPU')
        print("physical_devices-------------", len(physical_devices))
        # Grow GPU memory on demand instead of reserving it all up front.
        # NOTE(review): raises IndexError if no GPU is present.
        tf.config.experimental.set_memory_growth(physical_devices[0], True)
        self.episodeLength = 25
        if env == "CartPole-v1":
            self.env = gym.make('CartPole-v1')
            self.states = self.env.observation_space.shape[0]
            self.actions = self.env.action_space.n
            self.saveFileName = 'sarsa_weights.h5f'
            logdir = "logs/CartPoleV1/" + datetime.now().strftime(
                "%Y%m%d-%H%M%S")
            self.tensorboard_callback = keras.callbacks.TensorBoard(
                log_dir=logdir)
            self.visualize = True
        elif env == "Dobot":
            self.env = dobotGym.dobotGym(emulateOculus=emulateOculus,
                                         episodeLength=self.episodeLength,
                                         visualize=visualize,
                                         teachingFilesPath=teachingFilesPath,
                                         dobotEmulation=dobotEmulation)
            self.states = self.env.observation_space.shape[0]
            self.actions = self.env.action_space.shape[0]
            self.saveFileName = 'sarsa_weights_dobot.h5f'
            logdir = "logs/Dobot/" + datetime.now().strftime("%Y%m%d-%H%M%S")
            self.tensorboard_callback = keras.callbacks.TensorBoard(
                log_dir=logdir)
            self.visualize = True
        else:
            raise TypeError("Wrong env")

        print(
            'States', self.states
        )  # To get an idea about the number of variables affecting the environment
        print(
            'Actions', self.actions
        )  # To get an idea about the number of possible actions in the environment, do [right,left]

        # Save weights only every `savingFreq`-th checkpoint request.
        self.savingFreq = 100
        self.actualSaving = 0

        self.model = self.agentSarsa(self.states, self.actions)
        # Anneal the inner policy's exploration attribute (e.g. eps) from
        # value_max down to value_min over nb_steps training steps.
        self.policy = LinearAnnealedPolicy(
            inner_policy=self.policyValues["inner_policy"],
            attr=self.policyValues["attr"],
            value_max=self.policyValues["value_max"],
            value_min=self.policyValues["value_min"],
            value_test=self.policyValues["value_test"],
            nb_steps=self.policyValues["nb_steps"])
        self.agent = SARSAAgent(model=self.model,
                                policy=self.policy,
                                nb_actions=self.actions)

        # HACK: monkey-patch attributes that tf.keras's ModelCheckpoint
        # callback expects, so it can be used with a keras-rl agent.
        self.agent._is_graph_network = True

        def t():
            return False

        self.agent._in_multi_worker_mode = t

        # Route the callback's save() through the throttled weight saver.
        self.agent.save = self.saveAgentWeights

    def saveAgentWeights(self, path, overwrite=True):
        """Save agent weights, but only once every ``savingFreq`` calls.

        The requested ``path`` is ignored in favour of a timestamped file
        in model/checkpoint/.
        """
        if self.actualSaving < self.savingFreq:
            self.actualSaving += 1
            return None
        else:
            self.actualSaving = 0
        path = 'model/checkpoint/' + datetime.now().strftime(
            "%Y%m%d-%H%M%S") + self.saveFileName
        self.agent.save_weights(path, overwrite)

    def agentSarsa(self, states, actions):
        """Build the LSTM-fronted Q-network and dump its architecture JSON
        into the checkpoint folder."""
        self.model = Sequential()
        self.model.add(LSTM(42, activation='sigmoid', input_shape=(1, states)))
        self.model.add(Dense(42, activation='sigmoid'))
        self.model.add(Dense(42, activation='sigmoid'))
        self.model.add(Dense(24, activation='sigmoid'))
        self.model.add(Dense(12, activation='sigmoid'))
        self.model.add(Dense(actions, activation='linear'))
        self.path = fileOperation.saveToFolder(self.model.to_json(),
                                               name='modelShape',
                                               folder="model\\checkpoint")
        return self.model

    def load(self):
        """Pick a .h5f weights file via a file dialog and load it.

        NOTE(review): the agent is compiled both before and after loading;
        kept as-is since keras-rl requires a compiled agent for both steps.
        """
        path = fileOperation.openDialogFunction(".h5f")
        self.agent.compile('adam', metrics=['mse'])
        self.agent.load_weights(path)
        self.agent.compile('adam', metrics=['mse'])

    def test(self, nb_episodes=2):
        """Run the agent for a few evaluation episodes."""
        _ = self.agent.test(self.env,
                            nb_episodes=nb_episodes,
                            visualize=self.visualize)

    def fit(self, visualize=False):
        """Compile and train the agent with TensorBoard logging and
        periodic checkpointing, then report the mean of 5 test games."""
        checkpoint_filepath = 'model/checkpoint/'
        model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
            filepath=checkpoint_filepath,
            save_weights_only=False,
            save_freq=25)
        self.agent.compile('adam', metrics=['mse'])
        self.agent.fit(
            self.env,
            nb_steps=self.policyValues["nb_steps"],
            log_interval=self.episodeLength,
            visualize=visualize,
            verbose=1,
            nb_max_start_steps=1,
            # Reset the LSTM's internal state at the start of each episode.
            start_step_policy=self.model.reset_states,
            callbacks=[self.tensorboard_callback, model_checkpoint_callback],
        )

        scores = self.agent.test(self.env, nb_episodes=5, visualize=visualize)
        print('Average score over 5 test games:{}'.format(
            np.mean(scores.history['episode_reward'])))
Ejemplo n.º 12
0
class KerasSarsaAgent(AbstractAgent):
    """Keras-rl SARSA agent adapted to the project's AbstractAgent
    interface. States are assumed to be single integer indices, embedded
    via a Keras Embedding layer."""

    def __init__(self, env, timesteps_per_episode=10001):
        super().__init__(env, timesteps_per_episode)
        self.num_episodes = 400
        self.evaluating = False
        self.action_size = env.action_space.n
        # NOTE(review): assumes env exposes num_states as an int — the
        # size of the discrete state space used by the Embedding layer.
        self.state_size = env.num_states
        self.model = self._build_compile_model()
        self.agent = SARSAAgent(model=self.model,
                                nb_actions=self.action_size,
                                policy=EpsGreedyQPolicy())

    def run(self) -> {str: float}:
        """
        The agent's training method.
        Returns: a dictionary - {"episode_reward_mean": __, "episode_reward_min": __, "episode_reward_max": __,
        "episode_len_mean": __}
        """
        self.agent.compile(Adam(lr=0.001), metrics=["mse"])
        # ITER_NUM is a module-level constant defined elsewhere in the file.
        history = self.agent.fit(self.env,
                                 nb_steps=ITER_NUM,
                                 visualize=False,
                                 verbose=1)
        # keras-rl leaves history empty if no episode completed; fall back
        # to zero-filled placeholders in that case.
        if len(history.history) > 0:
            episode_reward = history.history["episode_reward"]
            nb_episode_steps = history.history["nb_episode_steps"]
        else:
            episode_reward, nb_episode_steps = [0], [0]  # TODO - placeholder
        result = {
            EPISODE_REWARD_MEAN: np.array(episode_reward),
            EPISODE_STEP_NUM_MEAN: np.array(nb_episode_steps),
            EPISODE_REWARD_MIN: np.empty([]),
            EPISODE_REWARD_MAX: np.empty([]),
            EPISODE_VARIANCE: np.empty([])
        }
        return result

    def _build_compile_model(self):
        """Build the (uncompiled) Q-network: embed the integer state into
        10 dimensions, then three ReLU layers and a linear head."""
        model = Sequential()
        # model.add(Flatten(input_shape=(1, self.action_size)))
        model.add(Embedding(self.state_size, 10, input_length=1))  # 600000
        model.add(Reshape((10, )))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(16, activation='relu'))
        model.add(Dense(16, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        return model

    def compute_action(self, state) -> int:
        """
        Computes the best action from a given state.
        Returns: a int that represents the best action.
        """
        # Shape (1, 1): batch of one, input_length of one for the Embedding.
        state = np.array([[state]])
        return int(np.argmax(self.model.predict(state)))

    def stop_episode(self):
        # No per-episode cleanup required for this agent.
        pass

    def episode_callback(self, state, action, reward, next_state, terminated):
        # Training happens inside keras-rl's fit(); no per-step hook needed.
        pass

    def evaluate(self, visualize=False):
        """Run 5 capped evaluation episodes through keras-rl's test()."""
        self.agent.test(self.env,
                        nb_episodes=5,
                        visualize=visualize,
                        nb_max_episode_steps=60)

    def replay_experiences(self):
        # SARSA is on-policy and keeps no replay memory.
        pass
Ejemplo n.º 13
0
                     verbose=2,
                     nb_max_episode_steps=500,
                     callbacks=[tb])  # 20s episodes

    # print history
    print("history contents : ",
          hist.history.keys())  # episode_reward, nb_episode_steps, nb_steps
    # summarize history for accuracy
    import matplotlib.pyplot as plt
    plt.plot(hist.history['episode_reward'])
    plt.plot(hist.history['nb_episode_steps'])
    plt.title('learning')
    plt.xlabel('episode')
    plt.legend(['episode_reward', 'nb_episode_steps'], loc='upper left')
    plt.show()

    # save history
    with open('_experiments/history_' + filename + '.pickle', 'wb') as handle:
        pickle.dump(hist.history, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # After training is done, we save the final weights.
    sarsa.save_weights('h5f_files/dqn_{}_weights.h5f'.format(filename),
                       overwrite=True)

    # Finally, evaluate our algorithm for 5 episodes.
    sarsa.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=500)

if mode == 'test':
    # Restore the trained weights and watch the agent for 10 episodes;
    # nb_max_episode_steps=400 caps each run at roughly 40 seconds.
    sarsa.load_weights('h5f_files/dqn_{}_weights.h5f'.format(filename))
    sarsa.test(env, nb_episodes=10, visualize=True,
               nb_max_episode_steps=400)  # 40 seconds episodes
Ejemplo n.º 14
0
# Number of discrete actions exposed by the environment (env is created
# earlier in the file, outside this excerpt).
nb_actions = env.action_space.n

# Next, we build a very simple model.
# Three Dense(16)+ReLU hidden layers over the flattened (window=1)
# observation, one linear Q-value output per action.
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

# SARSA does not require a memory.
policy = BoltzmannQPolicy()
sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=policy)
sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
sarsa.fit(env, nb_steps=50000, visualize=False, verbose=2)

# After training is done, we save the final weights.
sarsa.save_weights('sarsa_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
sarsa.test(env, nb_episodes=5, visualize=True)
Ejemplo n.º 15
0
# Build the CartPole environment and record its state and action sizes.
env = gym.make('CartPole-v1')
states = env.observation_space.shape[0]
actions = env.action_space.n


def agent(states, actions):
    """Build an uncompiled Q-network: flatten the (1, states) observation,
    apply two Dense(24, relu) hidden layers, and emit one linear Q-value
    per action."""
    net = Sequential()
    net.add(Flatten(input_shape=(1, states)))
    for _ in range(2):
        net.add(Dense(24, activation='relu'))
    net.add(Dense(actions, activation='linear'))
    return net


# Instantiate the Q-network sized to this environment.
model = agent(env.observation_space.shape[0], env.action_space.n)

from rl.agents import SARSAAgent
from rl.policy import EpsGreedyQPolicy

sarsa = SARSAAgent(model=model,
                   policy=EpsGreedyQPolicy(),
                   nb_actions=env.action_space.n)
# Training is skipped here; pre-trained weights are loaded instead.
# sarsa.compile('adam', metrics = ['mse'])
# sarsa.fit(env, nb_steps = 50000, visualize = False, verbose = 1)
# scores = sarsa.test(env, nb_episodes = 100, visualize= True)

# sarsa.save_weights('1-sarsa_weights.h5f', overwrite=True)
sarsa.load_weights('1-sarsa_weights.h5f')
# Replay 100 rendered episodes and report the mean episode reward.
_ = sarsa.test(env, nb_episodes=100, visualize=True)
print('Average score over 100 test games:{}'.format(
    np.mean(_.history['episode_reward'])))
Ejemplo n.º 16
0
import pickle

# Persist the training metrics for this (scale, environment) run.
# Fixed: the original passed a bare open(...) to pickle.dump, leaking the
# file handle; a context manager guarantees the file is closed.
with open('sarsa_%d_%s_metrics.p' % (scale, ENV_NAME), "wb") as metrics_file:
    pickle.dump(metrics, metrics_file)

# load model for testing
sarsa.load_weights('/home/am/Desktop/set_tests/final/sarsa_%d_%s_weights.h5f' %
                   (scale, ENV_NAME))

# setting up monitoring tools to record the testing episodes
from gym import monitoring
from gym.wrappers import Monitor


def episode5(episode_id):
    """Video-record only the first five test episodes."""
    return episode_id < 5


temp = '/home/am/Desktop/set_tests/final/sarsa_%d_%s' % (scale, ENV_NAME)
env = Monitor(env, temp, force=True, video_callable=episode5)

# testing
sarsa.test(env, nb_episodes=5, visualize=False, nb_max_episode_steps=2000)

# Close the monitor so its statistics are flushed, then load them back.
env.close()
results = monitoring.load_results(temp)
Ejemplo n.º 17
0
    model.add(Flatten(input_shape=(1, states)))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(actions))
    model.add(Activation('linear'))
    return model


# Build the Q-network from the agent() factory defined earlier in the file.
model = agent(states, actions)

# Define the policy
policy = EpsGreedyQPolicy()
# Define SARSA agent by feeding it the policy and the model
sarsa = SARSAAgent(model=model,
                   nb_actions=actions,
                   nb_steps_warmup=10,
                   policy=policy)
# compile sarsa with mean squared error loss
sarsa.compile('adam', metrics=['mse'])
# train the agent for 50000 steps
sarsa.fit(env, nb_steps=50000, visualize=False, verbose=1)

# Evaluate the agent on 100 new episodes.
scores = sarsa.test(env, nb_episodes=100, visualize=False)
print('Average score over 100 test games: {}'.format(
    np.mean(scores.history['episode_reward'])))
Ejemplo n.º 18
0
                            kernel_initializer=weight_initializer)(hiddenLayer)

# Linear Q-value head over the hidden layer built earlier in the file
# (Keras functional API).
outputLayer = Dense(nb_actions, activation='linear')(hiddenLayer)

model = Model(inputLayer, outputLayer)
print(model.summary())

# SARSA does not require a memory.
policy = BoltzmannQPolicy()
sarsa = SARSAAgent(model=model,
                   nb_actions=nb_actions,
                   nb_steps_warmup=10,
                   policy=policy)
sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

# Either restore previously trained weights, or train from scratch and
# time the run.
if loadFromExisting:
    sarsa.load_weights(file_path)
else:
    startTime = time.time()
    sarsa.fit(env, nb_steps=nSteps, visualize=True, verbose=1)
    endTime = time.time()
    sarsa.save_weights(file_path, overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
sarsa.test(env, nb_episodes=5, visualize=True)

if not loadFromExisting:
    # Fixed typo in the user-facing message ("trian" -> "train").
    print("Time taken to train: {0}".format(endTime - startTime))
Ejemplo n.º 19
0
        model = Sequential()
        model.add(Flatten(input_shape=(1, states)))
        model.add(Dense(24, activation="relu"))
        model.add(Dense(24, activation="relu"))
        model.add(Dense(24, activation="relu"))
        model.add(Dense(actions, activation="linear"))
        return model

    # Build the Q-network sized to this environment.
    model = agent(env.observation_space.shape[0], env.action_space.n)

    policy = EpsGreedyQPolicy()

    sarsa = SARSAAgent(model=model,
                       policy=policy,
                       nb_actions=env.action_space.n)

    sarsa.compile("adam", metrics=["mse"])

    sarsa.fit(env, nb_steps=10000, visualize=False, verbose=0)

    scores = sarsa.test(env, nb_episodes=50, visualize=True)

    # Fixed: the original message said "over 100 test games" although only
    # 50 episodes are run above.
    print('Average score over 50 test games:{}'.format(
        np.mean(scores.history['episode_reward'])))

    #sarsa.save_weights('sarsa_weights.h5f', overwrite=True) # save trained weights

    # sarsa.load_weights('sarsa_weights.h5f') # can be used to load trained weights

    _ = sarsa.test(env, nb_episodes=50, visualize=False)