Example no. 1
import gym

# The DQN agent class and the ENV_NAME / EPISODE / STEP / TEST constants are
# assumed to be defined elsewhere in this module.
def main():
    # initialize the OpenAI Gym env and the DQN agent
    env = gym.make(ENV_NAME)
    agent = DQN(env)

    for episode in range(EPISODE):
        # initialize task
        state = env.reset()
        # Train
        for step in range(STEP):
            action = agent.egreedy_action(state)  # e-greedy action for train
            next_state, reward, done, _ = env.step(action)
            # Shape the reward for the agent: -1 on termination, small bonus otherwise
            reward_agent = -1 if done else 0.1
            agent.perceive(state, action, reward_agent, next_state, done)
            state = next_state
            if done:
                break
        # Evaluate with greedy actions every 100 episodes
        if episode % 100 == 0:
            total_reward = 0
            for i in range(TEST):
                state = env.reset()
                for j in range(STEP):
                    env.render()
                    action = agent.action(state)  # greedy action for evaluation
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            ave_reward = total_reward / TEST
            print('episode: ', episode, 'Evaluation Average Reward:', ave_reward)
            if ave_reward >= 200:
                break
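
Each of these examples calls into a user-defined DQN class that is not shown. A minimal sketch of the interface Example no. 1 relies on (the method names egreedy_action, action and perceive come from the snippet; the epsilon-greedy logic and replay buffer are assumptions) could look like this:

import random
from collections import deque

class DQN(object):
    """Sketch of the agent interface used above; the Q-network itself is omitted."""

    def __init__(self, env, epsilon=0.5, memory_size=10000):
        self.action_dim = env.action_space.n
        self.epsilon = epsilon
        self.replay_buffer = deque(maxlen=memory_size)

    def egreedy_action(self, state):
        # Exploration: random action with probability epsilon, otherwise greedy.
        if random.random() < self.epsilon:
            return random.randrange(self.action_dim)
        return self.action(state)

    def action(self, state):
        # Greedy action; a real implementation returns argmax_a Q(state, a)
        # from the learned network.
        raise NotImplementedError

    def perceive(self, state, action, reward, next_state, done):
        # Store the transition and, in a real implementation, train the
        # Q-network on a minibatch sampled from the replay buffer.
        self.replay_buffer.append((state, action, reward, next_state, done))
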
Example no. 2
def main():
    # ENV, EPISODE and STEP, as well as the DQN class, are defined elsewhere.
    env = gym.make(ENV)
    agent = DQN(env.observation_space.shape[0], env.action_space.n,
                logdir='/data/log/LunarLander-v2')

    for episode in range(EPISODE):
        state = env.reset()

        for step in range(STEP):
            env.render()
            action = agent.egreedy_action(state)
            next_state, reward, terminate, _ = env.step(action)
            agent.observe_action(state, action, reward, next_state, terminate)
            state = next_state

            if terminate:
                break

        if episode % 100 == 0:
            total_reward = 0
            for i in range(5):
                state = env.reset()
                for j in range(STEP):
                    env.render()
                    action = agent.action(state)
                    state, reward, terminate, _ = env.step(action)
                    total_reward += reward
                    if terminate:
                        break
            agent.summary(episode, total_reward / 5)
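
ENV, EPISODE and STEP are module-level names that the snippet does not define. Judging from the logdir argument, a setup along the following lines is assumed (the concrete numbers are placeholders, not taken from the original):

import gym

ENV = 'LunarLander-v2'  # inferred from the logdir path
EPISODE = 10000         # placeholder value
STEP = 1000             # placeholder value

if __name__ == '__main__':
    main()
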
Example no. 3
import time

import gym
import keras
import numpy as np
from keras.layers import Dense

# The DQN and dqnScorerMountainCar classes are assumed to be defined elsewhere
# in this module.
start_time = time.time()  # referenced below when printing elapsed time

def main(lr=0.001, episodeMemory=100, replaySize=64, gamma=0.95):
    np.random.seed(0)
    env = gym.make('MountainCar-v0')
    model = keras.Sequential()
    model.add(
        Dense(128, activation="relu", input_dim=3,
              kernel_initializer='normal'))
    model.add(Dense(52, activation="relu"))
    model.add(Dense(1, kernel_initializer='normal', activation="linear"))
    adam = keras.optimizers.Adam(lr=lr)
    model.compile(loss='mean_squared_error', optimizer=adam)

    #gamma = 0.95
    memorySize = 200 * episodeMemory
    dqn = DQN(model, gamma, memorySize, replaysize=replaySize, _env=env)
    dqnScore = dqnScorerMountainCar(dqn, _env=env)
    nrofEpisodes = 1001
    #nrofEpisodes = 20

    res = np.zeros(shape=(nrofEpisodes, 2))

    for episode in range(nrofEpisodes):
        env.reset()
        action = 0
        obs, _, done, _ = env.step(action)
        if (episode % 100) == 10:
            print("episode ", episode)
            dqnScore.printDistance()
            #dqnScore.plot_cost_to_go_mountain_car()
            #print(res[episode-1,:])
            print("--- %s seconds ---" % (time.time() - start_time))
        iter = 0
        while not done:
            iter += 1
            action = dqn.action(obs)
            new_obs, reward, done, info = env.step(action)
            # Reaching the goal before the 200-step limit ends the episode early;
            # grant a bonus proportional to how quickly it was reached.
            if done and iter < 199:
                reward = (200 - iter) / 10
                print("****Success*****", -iter)

            dqn.add(action, obs, new_obs, reward)
            obs = new_obs

            #if(episode % 100) == 10:
            #    env.render()
        dqn.replay()
        env.reset()
        dqnScore.updateResult(iter)
        #res[episode,:] = [np.min(x[:,0]),np.max(x[:,0])]
    title = "eps_%d_mem_%d_rep_%d_gamma_%d" % (nrofEpisodes, episodeMemory,
                                               replaySize, gamma * 100)
    dqnScore.plotResults(title)
    dqnScore.plot_cost_to_go_mountain_car(title)
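
The model built above takes 3 inputs and produces a single linear output, which suggests this DQN variant approximates Q(s, a) directly, feeding the 2-dimensional MountainCar observation together with the action. That is an inference from the code, not something the snippet states. Under that assumption, dqn.action(obs) would evaluate the model once per candidate action and return the argmax, roughly as follows (greedy_action and n_actions are hypothetical names):

import numpy as np

def greedy_action(model, obs, n_actions=3):
    # Evaluate Q(s, a) for each discrete action and pick the best one.
    # Assumes the model input layout is [position, velocity, action].
    q_values = [
        model.predict(np.array([[obs[0], obs[1], a]]))[0, 0]
        for a in range(n_actions)
    ]
    return int(np.argmax(q_values))
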
Example no. 4
import gym

# The DQN class (Q-network, replay memory, target-network copy, epsilon schedule)
# is assumed to be defined elsewhere in this module.
class CartPolePlay(object):
    def __init__(
        self,
        hidden_dims,
        step_to_copy_graph=300,
        step_each_episode=500,
    ):
        self.step_to_copy_graph = step_to_copy_graph
        self.step_each_episode = step_each_episode

        self.dqn = DQN(4, 2, hidden_dims)
        self.env = gym.make('CartPole-v0')

    def train(self, num_train=5000):
        running_score = 0.0
        num_episode = 0
        num_step = 0

        for _ in range(num_train):
            state = self.env.reset()

            for t in range(self.step_each_episode):
                num_step += 1
                action = self.dqn.action(state)
                next_state, reward, done, _ = self.env.step(action)
                # Reward shaping: large penalty when the pole falls, small bonus per step
                reward = -100 if done else 0.1
                self.dqn.remember(state, action, reward, done, next_state)
                state = next_state

                self.dqn.learn()

                if num_step % self.step_to_copy_graph == 0:
                    self.dqn.copy_graph()

                if done:
                    running_score += t
                    break

            num_episode += 1
            self.dqn.decrease_epsilon()
            if num_episode % 100 == 0:
                running_score /= 100
                print("Current running score is: %.2f" % running_score)
                if running_score > 195.0:
                    print("HaHa, solved in: %d" % num_episode)
                    return True
                running_score = 0.0
        return False

    def play(self, num_episode):
        total_score = 0.0
        for _ in range(num_episode):
            state = self.env.reset()
            while True:
                action = self.dqn.play(state)
                next_state, reward, done, _ = self.env.step(action)
                if done:
                    break
                total_score += 1
                state = next_state
        return total_score / num_episode

    def store(self):
        self.dqn.save('model/dqn/cartpole-v0.ckpt')
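
A short usage sketch for this class; the hidden_dims value and the episode counts are illustrative assumptions, not taken from the original:

if __name__ == '__main__':
    player = CartPolePlay(hidden_dims=[64, 64])  # hidden_dims format is an assumption
    if player.train(num_train=5000):
        player.store()
        print("Average evaluation score: %.2f" % player.play(100))
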
Example no. 5
            # Run training steps in this episode until the episode ends
            action = agent.egreedy_action(state)
            # env.step also returns an info dict, which we ignore with _
            next_state, reward, done, _ = env.step(action)
            # Shape the reward for the agent: -1 on termination, small bonus otherwise
            reward_agent = -1 if done else 0.1
            agent.perceive(state, action, reward_agent, next_state, done)
            state = next_state

            if done:
                # when the episode is complete
                break

        # Evaluate every 100 episodes
        if episode % 100 == 0:
            total_reward = 0
            for i in range(TEST):
                state = env.reset()
                for j in range(STEP):
                    env.render()
                    # The only difference from training: use the greedy action produced by the DQN directly
                    action = agent.action(state)
                    state, reward, done, _ = env.step(action=action)
                    total_reward += reward
                    if done:
                        break
            ave_reward = total_reward / TEST
            print("episode: {} avg reward: {}".format(episode, ave_reward))
            if ave_reward >= 200:
                break
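
This last snippet is only the body of the training loop; the surrounding setup is not shown. Judging from Example no. 1, which it closely mirrors, the enclosing code would look roughly like the following sketch (names such as ENV_NAME, EPISODE, STEP and TEST are carried over from that example):

def main():
    env = gym.make(ENV_NAME)
    agent = DQN(env)

    for episode in range(EPISODE):
        state = env.reset()
        for step in range(STEP):
            ...  # the per-step training code shown above
        # ...followed by the periodic evaluation block shown above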