# Assumed imports and environment setup for this snippet; they are not shown
# in the original, but they match the near-identical CartPole example later
# in this listing.
import numpy as np
import gym
from keras.models import Sequential
from keras.layers import Dense, Flatten
from rl.agents import SARSAAgent
from rl.policy import EpsGreedyQPolicy

env = gym.make('CartPole-v1')


# Make a neural net with 3 hidden layers
def agent(states, actions):
    model = Sequential()
    model.add(Flatten(input_shape=(1, states)))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(actions, activation='linear'))
    return model


# Instantiate the network for this environment's observation and action sizes
model = agent(env.observation_space.shape[0], env.action_space.n)

policy = EpsGreedyQPolicy()
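# EpsGreedyQPolicy takes a random action with probability eps (0.1 by default
# in keras-rl) and the highest-Q action otherwise.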
# Create a SARSA (State-Action-Reward-State-Action) agent that follows this policy
sarsa = SARSAAgent(model=model, policy=policy, nb_actions=env.action_space.n)
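# SARSA is on-policy TD control; after each step the agent updates
#   Q(s, a) <- Q(s, a) + alpha * (r + gamma * Q(s', a') - Q(s, a))
# using the action a' actually chosen by the policy in the next state s'.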
# Compile the agent: choose the optimizer (Adam) and a metric (MSE) to track
sarsa.compile('adam', metrics=['mse'])

# sarsa.fit(env, nb_steps = 50000, visualize = False, verbose = 1)
sarsa.load_weights('cartpolekerassarsa.h5f')

scores = sarsa.test(env, nb_episodes=10, visualize=False)
print('Average score over 10 test games: {}'.format(
    np.mean(scores.history['episode_reward'])))

sarsa.save_weights('cartpolekerassarsa.h5f', overwrite=True)
sarsa.test(env, nb_episodes=2, visualize=True)
Example No. 2
                            kernel_initializer=weight_initializer)(hiddenLayer)

outputLayer = Dense(nb_actions, activation='linear')(hiddenLayer)

model = Model(inputLayer, outputLayer)
print(model.summary())

# SARSA does not require a memory.
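# (SARSA is on-policy and learns from the current transition
# (s, a, r, s', a') only, so no experience-replay buffer is needed.)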
policy = BoltzmannQPolicy()
sarsa = SARSAAgent(model=model,
                   nb_actions=nb_actions,
                   nb_steps_warmup=10,
                   policy=policy)
sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

if loadFromExisting:
    sarsa.load_weights(file_path)
else:
    startTime = time.time()
    sarsa.fit(env, nb_steps=nSteps, visualize=True, verbose=1)
    endTime = time.time()
    sarsa.save_weights(file_path, overwrite=True)

# After training is done, we save the final weights.

# Finally, evaluate our algorithm for 5 episodes.
sarsa.test(env, nb_episodes=5, visualize=True)

if not loadFromExisting:
    print("Time taken to trian: {0}".format(endTime - startTime))
Example No. 3
env = gym.make('CartPole-v1')
states = env.observation_space.shape[0]
actions = env.action_space.n


def agent(states, actions):
    model = Sequential()
    model.add(Flatten(input_shape=(1, states)))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(actions, activation='linear'))
    return model


model = agent(env.observation_space.shape[0], env.action_space.n)

from rl.agents import SARSAAgent
from rl.policy import EpsGreedyQPolicy

sarsa = SARSAAgent(model=model,
                   policy=EpsGreedyQPolicy(),
                   nb_actions=env.action_space.n)
# The agent must be compiled before loading weights and testing.
sarsa.compile('adam', metrics=['mse'])
# sarsa.fit(env, nb_steps = 50000, visualize = False, verbose = 1)
# scores = sarsa.test(env, nb_episodes = 100, visualize= True)

# sarsa.save_weights('1-sarsa_weights.h5f', overwrite=True)
sarsa.load_weights('1-sarsa_weights.h5f')
scores = sarsa.test(env, nb_episodes=100, visualize=True)
print('Average score over 100 test games: {}'.format(
    np.mean(scores.history['episode_reward'])))
Example No. 4
    for key in dc:
        re.append(dc[key])
    return re


tt = dict_to_list(tpl.rewards_mean)
mm = np.array(tt[:-1])
kk = dict_to_list(tpl.metrics_at_end)
jj = np.array(kk[:-1])
metrics = np.column_stack((mm, jj))

import pickle
pickle.dump(metrics, open('sarsa_%d_%s_metrics.p' % (scale, ENV_NAME), "wb"))

# load model for testing
sarsa.load_weights('/home/am/Desktop/set_tests/final/sarsa_%d_%s_weights.h5f' %
                   (scale, ENV_NAME))

# setting up monitoring tools to record the testing episodes
from gym import monitoring
from gym.wrappers import Monitor


def episode5(episode_id):
    # Record only the first five test episodes.
    return episode_id < 5


#rec = StatsRecorder(env,"sarsa_1")
#rec.capture_frame()
Example No. 5
# Next, we build a very simple model.
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(8))
model.add(Activation('relu'))
model.add(Dense(8))
model.add(Activation('relu'))
model.add(Dense(8))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

# SARSA does not require a memory.
policy = BoltzmannQPolicy()
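# BoltzmannQPolicy samples actions with probabilities proportional to
# exp(Q(s, a) / tau), so higher-valued actions are chosen more often while
# exploration is still possible.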
sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=policy)
sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

model_fn = 'sarsa_{}_weights.h5f'.format(ENV_NAME)
if os.path.isfile(model_fn):
    sarsa.load_weights(model_fn)
# Okay, now it's time to learn something! Training is left commented out below;
# uncomment sarsa.fit() to train from scratch instead of relying on the loaded
# weights. You can always safely abort training prematurely using Ctrl + C.
#sarsa.fit(env, nb_steps=50000,nb_max_episode_steps=500, visualize=False, verbose=2)

# After training is done, we save the final weights.
#sarsa.save_weights('sarsa_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
sarsa.test(env, nb_episodes=5, visualize=True)
Example No. 6
class DQN:
    def __init__(
            self,
            env="CartPole-v1",
            emulateOculus=True,
            visualize=True,
            teachingFilesPath=None,
            policyValues={
                "inner_policy": EpsGreedyQPolicy(),
                "attr": "eps",
                "value_max": 0.75,
                "value_min": .01,
                "value_test": .0,
                "nb_steps": 50000
            },
            dobotEmulation=False):
        self.policyValues = policyValues
        os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'
        # Enable GPU memory growth when a GPU is present, so TensorFlow does
        # not allocate all GPU memory up front.
        physical_devices = tf.config.experimental.list_physical_devices('GPU')
        print("Number of physical GPU devices:", len(physical_devices))
        if physical_devices:
            tf.config.experimental.set_memory_growth(physical_devices[0], True)
        self.episodeLength = 25
        if env == "CartPole-v1":
            self.env = gym.make('CartPole-v1')
            self.states = self.env.observation_space.shape[0]
            self.actions = self.env.action_space.n
            self.saveFileName = 'sarsa_weights.h5f'
            logdir = "logs/CartPoleV1/" + datetime.now().strftime(
                "%Y%m%d-%H%M%S")
            self.tensorboard_callback = keras.callbacks.TensorBoard(
                log_dir=logdir)
            self.visualize = True
        elif env == "Dobot":
            self.env = dobotGym.dobotGym(emulateOculus=emulateOculus,
                                         episodeLength=self.episodeLength,
                                         visualize=visualize,
                                         teachingFilesPath=teachingFilesPath,
                                         dobotEmulation=dobotEmulation)
            self.states = self.env.observation_space.shape[0]
            self.actions = self.env.action_space.shape[0]
            self.saveFileName = 'sarsa_weights_dobot.h5f'
            logdir = "logs/Dobot/" + datetime.now().strftime("%Y%m%d-%H%M%S")
            self.tensorboard_callback = keras.callbacks.TensorBoard(
                log_dir=logdir)
            self.visualize = True
        else:
            raise ValueError("Unknown env: {}".format(env))

        # Number of variables describing the environment state.
        print('States', self.states)
        # Number of possible actions, e.g. [right, left] for CartPole.
        print('Actions', self.actions)

        #

        # episodes = 10
        # for episode in range(1, episodes + 1):
        #     # At the beginning of each episode, reset the game
        #     state = self.env.reset()
        #     # set done to False
        #     done = False
        #     # set score to 0
        #     score = 0
        #     # while the game is not finished
        #     while not done:
        #         # visualize each step
        #         self.env.render()
        #         # choose a random action
        #         action = random.choice([0, 1])
        #         # execute the action
        #         n_state, reward, done, info = self.env.step(action)
        #         # keep track of rewards
        #         score += reward
        #     print('episode {} score {}'.format(episode, score))

        # not working :(
        # self.agent = self.agentDDP(self.states, self.actions)
        # self.agent = self.NAFAgent(self.states, self.actions)

        # self.policy = EpsGreedyQPolicy()

        self.savingFreq = 100
        self.actualSaving = 0

        self.model = self.agentSarsa(self.states, self.actions)
        self.policy = LinearAnnealedPolicy(
            inner_policy=self.policyValues["inner_policy"],
            attr=self.policyValues["attr"],
            value_max=self.policyValues["value_max"],
            value_min=self.policyValues["value_min"],
            value_test=self.policyValues["value_test"],
            nb_steps=self.policyValues["nb_steps"])
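        # LinearAnnealedPolicy linearly anneals the wrapped policy's `eps`
        # attribute from value_max down to value_min over nb_steps of
        # training, and uses value_test during evaluation.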
        self.agent = SARSAAgent(model=self.model,
                                policy=self.policy,
                                nb_actions=self.actions)

        self.agent._is_graph_network = True

        def t():
            return False

        self.agent._in_multi_worker_mode = t

        self.agent.save = self.saveAgentWeights

        def lenmeh():
            return self.actions

        # self.agent.__len__ = lenmeh

    def saveAgentWeights(self, path, overwrite=True):
        if self.actualSaving < self.savingFreq:
            self.actualSaving += 1
            return None
        else:
            self.actualSaving = 0
        path = 'model/checkpoint/' + datetime.now().strftime(
            "%Y%m%d-%H%M%S") + self.saveFileName
        self.agent.save_weights(path, overwrite)

    def agentSarsa(self, states, actions):
        self.model = Sequential()
        self.model.add(LSTM(42, activation='sigmoid', input_shape=(1, states)))
        self.model.add(Dense(42, activation='sigmoid'))
        self.model.add(Dense(42, activation='sigmoid'))
        self.model.add(Dense(24, activation='sigmoid'))
        self.model.add(Dense(12, activation='sigmoid'))
        self.model.add(Dense(actions, activation='linear'))
        self.path = fileOperation.saveToFolder(self.model.to_json(),
                                               name='modelShape',
                                               folder="model\\checkpoint")

        # With stateful=False (the default), LSTM states are reset after each batch.
        # model.add(Flatten(input_shape=(1, states)))
        # dot_img_file = '/model_1.png'
        # keras.utils.plot_model(self.model, to_file=dot_img_file, show_shapes=True)
        # model.reset_states()
        return self.model

    def load(self):
        path = fileOperation.openDialogFunction(".h5f")
        # The agent must be compiled before its weights can be loaded and used.
        self.agent.compile('adam', metrics=['mse'])
        self.agent.load_weights(path)

    def test(self, nb_episodes=2):
        _ = self.agent.test(self.env,
                            nb_episodes=nb_episodes,
                            visualize=self.visualize)

    def fit(self, visualize=False):
        checkpoint_filepath = 'model/checkpoint/'
        model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
            filepath=checkpoint_filepath,
            save_weights_only=False,
            save_freq=25)
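        # With an integer save_freq, ModelCheckpoint saves a checkpoint every
        # 25 batches rather than once per epoch.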
        self.agent.compile('adam', metrics=['mse'])
        self.agent.fit(
            self.env,
            nb_steps=self.policyValues["nb_steps"],
            log_interval=self.episodeLength,
            visualize=visualize,
            verbose=1,
            nb_max_start_steps=1,
            start_step_policy=self.model.reset_states,

            # callbacks=[PlotLossesKeras()])
            callbacks=[self.tensorboard_callback, model_checkpoint_callback],
        )

        scores = self.agent.test(self.env, nb_episodes=5, visualize=visualize)
        print('Average score over 5 test games: {}'.format(
            np.mean(scores.history['episode_reward'])))
Example No. 7
                     verbose=2,
                     nb_max_episode_steps=500,
                     callbacks=[tb])  # 20s episodes

    # print history
    print("history contents : ",
          hist.history.keys())  # episode_reward, nb_episode_steps, nb_steps
    # plot episode reward and episode length over training
    import matplotlib.pyplot as plt
    plt.plot(hist.history['episode_reward'])
    plt.plot(hist.history['nb_episode_steps'])
    plt.title('learning')
    plt.xlabel('episode')
    plt.legend(['episode_reward', 'nb_episode_steps'], loc='upper left')
    plt.show()

    # save history
    with open('_experiments/history_' + filename + '.pickle', 'wb') as handle:
        pickle.dump(hist.history, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # After training is done, we save the final weights.
    sarsa.save_weights('h5f_files/dqn_{}_weights.h5f'.format(filename),
                       overwrite=True)

    # Finally, evaluate our algorithm for 5 episodes.
    sarsa.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=500)

if mode == 'test':
    sarsa.load_weights('h5f_files/dqn_{}_weights.h5f'.format(filename))
    sarsa.test(env, nb_episodes=10, visualize=True,
               nb_max_episode_steps=400)  # 40-second episodes
Example No. 8
                       nb_steps_warmup=3,
                       policy=policy)
    agent.compile(Adam(lr=1e-3), metrics=['mae'])
    agent.reset_states()

    #=========================================================================#

    # re-use weights if possible
    if (os.path.isfile(inv_weights_fname)):
        inverse_model.load_weights(inv_weights_fname)

    if (os.path.isfile(fwd_weights_fname)):
        forward_model.load_weights(fwd_weights_fname)

    if (os.path.isfile(agent_weights_fname)):
        agent.load_weights(agent_weights_fname)
#    else:
# FIXME: this bit is necessary or agent does nothing???
# probably initializes values or something
#    agent.fit(env, nb_steps=20, visualize=False)
    agent.training = True  # IMPORTANT!!! or it doesn't learn

    #=========================================================================#

    episode_count = 1000
    reward = 0
    done = False

    for i in range(episode_count):
        print "episode=%d" % i
        obs_now = env.reset()
Example No. 9
#memory = SequentialMemory(limit=50000, window_length=1)
policy = EpsGreedyQPolicyC4(env)
# dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,
#                target_model_update=1e-2, policy=policy,test_policy=policy)
# dqn.compile(Adam(lr=1e-3), metrics=['mae'])
sarsa = SARSAAgent(model=model,
                   nb_actions=nb_actions,
                   nb_steps_warmup=10,
                   policy=policy,
                   test_policy=policy)
sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

# Load weights
try:
    #dqn.load_weights(weights_filename)
    sarsa.load_weights(weights_filename)
except OSError:
    print("no saved weights found")
# Okay, now it's time to learn something! Visualization is disabled during
# training (visualize=False below) because rendering slows training down quite
# a lot. You can always safely abort the training prematurely using Ctrl + C.
#dqn.fit(env, nb_steps=5000000, visualize=False, verbose=2)
sarsa.fit(env,
          nb_steps=50000,
          visualize=False,
          verbose=1,
          callbacks=[WandbCallback()])
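# WandbCallback logs training metrics to Weights & Biases so the run can be
# monitored remotely.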

# After training is done, we save the final weights.
#dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True)