Ejemplo n.º 1
            # pre_train_steps = 10000
            if total_steps > pre_train_steps:

                # initial : e & endE
                # e = 1.0
                # endE = 0.1
                if e > endE:
                    e -= stepDrop

                # every total_steps by 4, batch is run
                if total_steps % (update_freq) == 0:

                    mini_batch = memory.sample(batch_size=MINIBATCH_SIZE)

                    states = np.array([each[0] for each in mini_batch])
                    actions = np.array([each[1] for each in mini_batch])
                    rewards = np.array([each[2] for each in mini_batch])
                    dones = np.array([each[3] for each in mini_batch])
                    next_states = np.array([each[4] for each in mini_batch])

                    #Below we perform the Double-DQN update to the target Q-values
                    Q1 = myAgent.sess.run(myAgent.q_predict,feed_dict={myAgent.x:states})
                    Q2 = myAgent.sess.run(myAgent.y_target,feed_dict={myAgent.x:next_states})
                    end_multiplier = -(dones - 1)
                    doubleQ = Q2[range(MINIBATCH_SIZE),Q1]

                    # y = Discount factor 0.99
                    targetQ = rewards + (y*doubleQ * end_multiplier)
Ejemplo n.º 2
def keepMemory(memory_size=10000, pretrain_length=5000, render=False):
    """Run the agent in the "SpaceInvador" environment and store transitions.

    For ``pretrain_length`` steps the agent picks an action for the current
    stacked state, the action is applied through ``stateCls.add_frame``, and
    the tuple ``(state, action, reward, done, next_state)`` is added to a
    ``Memory`` created with ``max_size=memory_size``. Each time ``add_frame``
    reports ``done``, the reward and frame count accumulated since the last
    reset are printed and the counters are cleared.

    Args:
        memory_size: Forwarded to ``Memory`` as ``max_size``.
        pretrain_length: Number of environment steps to run.
        render: Unused; kept so existing callers keep working.

    Returns:
        list: ``curr_state_actions``, which is never populated and is
        therefore always empty.

    NOTE(review): the filled ``memory`` is a local object and is not
    returned, so callers cannot reach the stored transitions through this
    function's result -- confirm whether that is intended.
    """
    envs = setEnv()
    env = envs["SpaceInvador"]

    # Wraps the environment and builds the stacked-frame state.
    stateCls = SteteClass(env)

    memory = Memory(max_size=memory_size)

    # NOTE(review): 6 is presumably the number of available actions -- confirm
    # against AgentClass.
    myAgent = AgentClass(6)

    # Forwarded to add_frame on every step.
    num_frames = 3

    alive_frame = 0
    total_reward = 0
    memory_full = False

    # Never appended to; returned only to preserve the original interface.
    curr_state_actions = []

    for _ in range(pretrain_length):
        init_state = stateCls.convertAndConcatenateBuffer()

        # Choose the action from the *current* state (with a leading batch
        # axis). Previously a state captured once before the loop was reused
        # for every step, so the agent never saw anything but the first screen.
        action, _q_values = myAgent.get_action(init_state[np.newaxis, :, :, :])

        _obs, rewards, done = stateCls.add_frame(action, num_frames)

        new_state = stateCls.convertAndConcatenateBuffer()
        memory.add((init_state, action, rewards, done, new_state))

        # Account for this step before the episode-end report so the terminal
        # step belongs to the episode that produced it rather than leaking
        # into the next one.
        total_reward += rewards
        alive_frame += 1

        if done:
            if memory_full:
                print("memory full.....")

            print("** rewards from done ...", total_reward)
            print("** maxium lived frame .. ", alive_frame)

            # Start counting a new episode.
            alive_frame = 0
            total_reward = 0

        # The mini-batch that used to be sampled here was discarded
        # immediately, so only the threshold flag is kept.
        if memory.checklength() > MIN_OBSERVATION:
            memory_full = True

    return curr_state_actions