Example #1
import time

import gym
import keras
import numpy as np
from keras.layers import Dense

start_time = time.time()  # referenced by the progress printout in main()


def main(lr=0.001, episodeMemory=100, replaySize=64, gamma=0.95):
    np.random.seed(0)
    env = gym.make('MountainCar-v0')
    # 3 inputs = 2 state dimensions plus 1 action index; 1 linear Q-value output.
    model = keras.Sequential()
    model.add(
        Dense(128, activation="relu", input_dim=3,
              kernel_initializer='normal'))
    model.add(Dense(52, activation="relu"))
    model.add(Dense(1, kernel_initializer='normal', activation="linear"))
    adam = keras.optimizers.Adam(lr=lr)
    model.compile(loss='mean_squared_error', optimizer=adam)

    # A MountainCar episode lasts at most 200 steps, so this keeps the
    # transitions of roughly the last `episodeMemory` episodes.
    memorySize = 200 * episodeMemory
    dqn = DQN(model, gamma, memorySize, replaysize=replaySize, _env=env)
    dqnScore = dqnScorerMountainCar(dqn, _env=env)
    nrofEpisodes = 1001

    for episode in range(nrofEpisodes):
        env.reset()
        # Take one fixed action to obtain an initial observation and done flag.
        action = 0
        obs, _, done, _ = env.step(action)
        # Print progress once per 100 episodes.
        if (episode % 100) == 10:
            print("episode ", episode)
            dqnScore.printDistance()
            print("--- %s seconds ---" % (time.time() - start_time))
        steps = 0
        while not done:
            steps += 1
            action = dqn.action(obs)
            new_obs, reward, done, info = env.step(action)
            # Finishing before the 200-step limit means the car reached the
            # goal; grant a bonus that grows the earlier that happens.
            if done and steps < 199:
                reward = (200 - steps) / 10
                print("****Success*****", -steps)

            dqn.add(action, obs, new_obs, reward)
            obs = new_obs

        dqn.replay()
        env.reset()
        dqnScore.updateResult(steps)
    title = "eps_%d_mem_%d_rep_%d_gamma_%d" % (nrofEpisodes, episodeMemory,
                                               replaySize, gamma * 100)
    dqnScore.plotResults(title)
    dqnScore.plot_cost_to_go_mountain_car(title)
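
The DQN and dqnScorerMountainCar classes come from the surrounding project and are not shown in this example. For orientation only, here is a minimal sketch of a DQN class compatible with the calls above. The constructor and method signatures are taken from Example #1; everything else is an assumption: epsilon-greedy exploration, uniform replay sampling, and a bootstrap target with no terminal-state handling (the add() call above stores no done flag).

# Hypothetical sketch, not the project's actual DQN class.
import random
from collections import deque

import numpy as np


class DQN:
    """Q-learner for a model that scores one (state, action) pair at a
    time, as the 3-input / 1-output network above suggests."""

    def __init__(self, model, gamma, memorySize, replaysize=64, _env=None):
        self.model = model
        self.gamma = gamma
        self.replaysize = replaysize
        self.env = _env
        self.memory = deque(maxlen=memorySize)

    def _q_values(self, obs):
        # Q(s, a) for every discrete action: feed the concatenated
        # (state, action) vector through the model.
        inputs = np.array([np.append(obs, a)
                           for a in range(self.env.action_space.n)])
        return self.model.predict(inputs, verbose=0).ravel()

    def action(self, obs, epsilon=0.1):
        # Epsilon-greedy selection; the exploration schedule is assumed.
        if random.random() < epsilon:
            return self.env.action_space.sample()
        return int(np.argmax(self._q_values(obs)))

    def add(self, action, obs, new_obs, reward):
        self.memory.append((action, obs, new_obs, reward))

    def replay(self):
        # One fitting step on a uniform minibatch of stored transitions.
        if len(self.memory) < self.replaysize:
            return
        batch = random.sample(list(self.memory), self.replaysize)
        x = np.array([np.append(o, a) for a, o, _, _ in batch])
        y = np.array([r + self.gamma * np.max(self._q_values(no))
                      for _, _, no, r in batch])
        self.model.fit(x, y, epochs=1, verbose=0)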
Example #2
    # One step of the training loop: epsilon-greedy action selection
    # followed by storing the transition in replay memory.
    Q = model(obs, save=False)
    # Explore for the first 100 steps, then with probability epsilon.
    if epsilon > rand() or step < 100:
        action = randint(0, 2)
    else:
        action = np.argmax(Q[0])
    # Anneal epsilon linearly down to zero.
    epsilon -= 2e-4
    if epsilon < 0.:
        epsilon = 0.
    obs, reward, done, _ = env.step(action)
    # Replace the environment reward: 0 per step, -1 when the episode ends.
    reward = 0.
    if done:
        reward = -1.
        episode += 1

    Memory.add(last_obs, action, reward, obs, done)

    if done:
        obs = env.reset()
    last_obs = deepcopy(obs)
    if done and episode % 100 == 0:
        # `time` is a step counter from the surrounding loop here,
        # not the time module.
        print('episode:', episode, 'step:', step, 'eps:', epsilon,
              'ave:', time / 100., 'Q:', Q[0])
        time = 0.

    # Skip training until 100 warm-up steps have been collected.
    if step < 100:
        continue
    # Train on the most recent transition only; the commented-out
    # alternative in the source was a sample(16) minibatch.
    sample = [Memory.ReplayMemory[(Memory.count - 1) % 10**6]]
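
The Memory object used in Example #2 is likewise defined elsewhere. Judging from the calls, it is a ring buffer holding up to 10**6 transitions. A hypothetical minimal version could look like the sketch below; only the add() signature and the ReplayMemory/count attributes come from the snippet, the class name ReplayBuffer and everything else are assumptions.

# Hypothetical sketch, not the project's actual Memory implementation.
import random


class ReplayBuffer:
    """Ring buffer matching the Memory usage in Example #2."""

    def __init__(self, capacity=10 ** 6):
        self.capacity = capacity
        self.ReplayMemory = [None] * capacity  # fixed-size transition store
        self.count = 0                         # transitions added so far

    def add(self, last_obs, action, reward, obs, done):
        # Overwrite the oldest slot once the buffer wraps around.
        self.ReplayMemory[self.count % self.capacity] = (
            last_obs, action, reward, obs, done)
        self.count += 1

    def sample(self, n):
        # Uniform minibatch over the filled part of the buffer, matching
        # the disabled sample(16) call in the snippet.
        filled = min(self.count, self.capacity)
        return [self.ReplayMemory[i] for i in random.sample(range(filled), n)]


Memory = ReplayBuffer()  # the snippet accesses a ready-made instance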