Example 1
import time

import numpy as np
from baselines.common.atari_wrappers import make_atari, wrap_deepmind

# acagent is the actor-critic agent class defined elsewhere in this project.

def run2(finallist):
    # Monitor process: repeatedly loads the latest global weights from
    # finallist[0] and renders an episode with the current policy.
    env1 = make_atari("PongNoFrameskip-v4")
    # Wrap the frames: grey-scale, stack four frames and scale to a smaller size
    env1 = wrap_deepmind(env1, frame_stack=True, scale=True, episode_life=True)
    globalagent1 = acagent()
    while True:
        if len(finallist) > 0:
            globalagent1.model.set_weights(finallist[0])
            state = env1.reset()
            state = np.reshape(state, [1, 84, 84, 4])
            for i in range(2000000):
                env1.render()
                at = globalagent1.choose_action(state)
                next_state, reward, done, _ = env1.step(at)
                next_state = np.reshape(next_state, [1, 84, 84, 4])
                state = next_state
                time.sleep(0.02)  # slow the loop down so the rendered game is watchable
                if done:
                    print(globalagent1.print_action_prob)
                    break
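
None of these examples shows the acagent class itself. The sketch below is a minimal reconstruction, assuming only what the examples use: a Keras model that maps a stacked 84x84x4 observation to a softmax policy and a scalar value, a choose_action method that samples from that policy, and a gamma attribute. The network architecture, the number of actions and the constructor arguments are assumptions, not taken from the original project. (Note that the workers below index the policy with actionlist[i]-1, which suggests the original choose_action may have returned 1-based actions; this sketch returns a 0-based index.)

import numpy as np
import tensorflow as tf
from tensorflow.keras import layers


class acagent:
    # Minimal actor-critic agent consistent with how the examples use it (assumed).
    def __init__(self, n_actions=6, gamma=0.99):
        self.gamma = gamma
        inputs = layers.Input(shape=(84, 84, 4))
        x = layers.Conv2D(16, 8, strides=4, activation="relu")(inputs)
        x = layers.Conv2D(32, 4, strides=2, activation="relu")(x)
        x = layers.Flatten()(x)
        x = layers.Dense(256, activation="relu")(x)
        actor = layers.Dense(n_actions, activation="softmax")(x)  # policy head
        critic = layers.Dense(1)(x)                               # value head
        self.model = tf.keras.Model(inputs, [actor, critic])

    def choose_action(self, state):
        # Sample an action from the current policy for the given stacked frames.
        probs, _ = self.model(state)
        return np.random.choice(probs.shape[1], p=probs.numpy()[0])

    def print_action_prob(self):
        # Placeholder: the original class presumably reports the latest policy output.
        pass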
Example 2
import numpy as np
import tensorflow as tf
from baselines.common.atari_wrappers import make_atari, wrap_deepmind

# acagent is the actor-critic agent class defined elsewhere in this project.

def run1(e, lock, finallist):
    # Training worker: collects a short rollout with the latest global weights,
    # accumulates the n-step actor-critic gradients, appends them to the shared
    # finallist, and then waits on the event e until the master has applied them.
    env = make_atari("PongNoFrameskip-v4")
    # Wrap the frames: grey-scale, stack four frames and scale to a smaller size
    env = wrap_deepmind(env, frame_stack=True, scale=True, episode_life=True)

    agent = acagent()
    entropymulti = 0.01  # entropy regularization coefficient
    e.clear()
    done = 0
    state = env.reset()
    state = np.reshape(state, [1, 84, 84, 4])

    while done == 0:
        # Reset the event so that e.wait() at the end of this iteration blocks
        # until the master signals that new weights are available.
        e.clear()

        # Accumulated gradients for this rollout, one entry per trainable variable.
        d_theta = None

        # Pull the latest global weights published by the master in finallist[0].
        agent.model.set_weights(finallist[0])

        statelist = []
        rewardlist = []
        actionlist = []
        donelist = []
        nextstatelist = []
        tstepcount = 0
        # Collect a rollout of at most 30 steps, or until the episode ends.
        for t in range(200000):
            at = agent.choose_action(state)

            next_state, reward, done, _ = env.step(at)
            next_state = np.reshape(next_state, [1, 84, 84, 4])
            statelist.append(state)
            actionlist.append(at)
            nextstatelist.append(next_state)

            rewardlist.append(reward)

            donelist.append(done)
            state = next_state
            tstepcount = tstepcount + 1

            if tstepcount >= 30:
                # Bootstrap the n-step return from the value of the last state reached.
                _, critic = agent.model(nextstatelist[-1])
                R = critic
                break
            if done:
                # Terminal state: no bootstrap value.
                R = 0.0
                break

        # Walk the rollout backwards, accumulating the discounted n-step return R.
        tlist = list(range(len(statelist)))
        tlist.reverse()

        for i in tlist:
            R = rewardlist[i] + agent.gamma * R

            with tf.GradientTape(persistent=True) as tape:
                actor, critic = agent.model(statelist[i])

                # Policy term: log-probability of the chosen action, weighted by the advantage (R - V).
                y_predpi = tf.math.log(tf.maximum(actor[0][actionlist[i]-1], 1e-4)) * (-critic[0] + R)
                # Value term: squared n-step return error.
                y_predv = tf.square(tf.math.subtract(R, critic[0]))

                # Total loss: policy loss + 0.5 * value loss - entropy bonus
                # (the sum of p*log p is the negative entropy of the policy).
                total_loss = -y_predpi + 0.5 * y_predv + entropymulti * tf.reduce_sum(
                    tf.math.log(tf.maximum(actor[0], 1e-4)) * tf.maximum(actor[0], 1e-4))

            # Accumulate this step's gradients, variable by variable.
            grads = tape.gradient(total_loss, agent.model.trainable_variables)
            if d_theta is None:
                d_theta = grads
            else:
                d_theta = [np.add(g_sum, g) for g_sum, g in zip(d_theta, grads)]

        # Hand the accumulated gradients to the master under the lock, then wait
        # until it has applied them and published fresh weights in finallist[0].
        lock.acquire()
        finallist.append(d_theta)
        lock.release()
        e.wait()
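
The master that drives these workers is not part of the examples. Below is a minimal sketch of one way it could work, assuming the protocol visible above: finallist[0] holds the current global weights, each worker appends one list of gradients per rollout, and setting the event e releases the workers after new weights are published. The use of threads, the RMSprop optimizer and its learning rate, and the names master and n_workers are assumptions, not taken from the original code.

import threading
import time

import tensorflow as tf


def master(e, lock, finallist, n_updates=100000):
    globalagent = acagent()                                   # global (shared) network
    optimizer = tf.keras.optimizers.RMSprop(learning_rate=1e-4)   # assumed optimizer
    finallist.append(globalagent.model.get_weights())         # slot 0: current weights

    for _ in range(n_updates):
        # Wait until some worker has appended a gradient list behind the weights.
        while len(finallist) < 2:
            time.sleep(0.001)
        with lock:
            grads = finallist.pop(1)                          # oldest pending gradients

        # Apply the worker's accumulated gradients to the global network.
        optimizer.apply_gradients(zip(grads, globalagent.model.trainable_variables))

        # Publish the new weights and wake up the waiting workers.
        finallist[0] = globalagent.model.get_weights()
        e.set()


e = threading.Event()
lock = threading.Lock()
finallist = []
n_workers = 4
workers = [threading.Thread(target=run1, args=(e, lock, finallist)) for _ in range(n_workers)]
monitor = threading.Thread(target=run2, args=(finallist,))
for w in workers + [monitor]:
    w.start()
master(e, lock, finallist)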
Example 3
import numpy as np
from baselines.common.atari_wrappers import make_atari, wrap_deepmind

# acagent is the actor-critic agent class defined elsewhere in this project.

def worker(e, weights_list, experience_list):
    # Variant of the worker above: instead of computing gradients locally, it
    # collects (action, state, done, return) tuples and hands them to a central
    # learner through the shared experience_list.
    e.clear()

    env = make_atari("PongNoFrameskip-v4")
    # Wrap the frames: grey-scale, stack four frames and scale to a smaller size
    env = wrap_deepmind(env, frame_stack=True, scale=True, episode_life=True)
    agent = acagent()
    done = 0
    state = env.reset()
    state = np.reshape(state, [1, 84, 84, 4])

    while done == 0:

        # Pull the latest global weights published by the learner in weights_list[0].
        agent.model.set_weights(weights_list[0])
        statelist = []
        rewardlist = []
        actionlist = []
        donelist = []
        nextstatelist = []
        tstepcount = 0
        # Collect a rollout of at most 100 steps, or until the episode ends.
        for t in range(200000):
            at = agent.choose_action(state)

            next_state, reward, done, _ = env.step(at)
            next_state = np.reshape(next_state, [1, 84, 84, 4])
            statelist.append(state)
            actionlist.append(at)
            nextstatelist.append(next_state)

            rewardlist.append(reward)

            donelist.append(done)
            state = next_state
            tstepcount = tstepcount + 1

            if tstepcount >= 100:
                # Bootstrap the n-step return from the value of the last state reached.
                _, critic = agent.model(nextstatelist[-1])
                R = critic
                break
            if done:
                # Terminal state: no bootstrap value.
                R = 0.0
                break

        # Walk the rollout backwards, attaching the discounted n-step return R
        # to every transition before handing the batch to the learner.
        tlist = list(range(len(statelist)))
        tlist.reverse()
        experience = []
        for i in tlist:
            R = rewardlist[i] + agent.gamma * R
            experience.append((actionlist[i], statelist[i], donelist[i], R))

        experience_list.append(experience)
        # Wait until the central learner has consumed the experience and
        # published new weights in weights_list[0].
        e.wait()
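
The central learner that consumes experience_list is not shown in the examples. Below is a minimal sketch of what it could look like, assuming it applies the same actor-critic loss as the previous example to each (action, state, done, return) tuple. The name learner, the RMSprop optimizer and its learning rate, and the update schedule are assumptions, not taken from the original code.

import time

import tensorflow as tf


def learner(e, weights_list, experience_list, entropymulti=0.01):
    globalagent = acagent()                                        # global (shared) network
    optimizer = tf.keras.optimizers.RMSprop(learning_rate=1e-4)    # assumed optimizer
    weights_list.append(globalagent.model.get_weights())           # slot 0: current weights

    while True:
        # Wait until a worker has delivered a rollout of experience tuples.
        if len(experience_list) == 0:
            time.sleep(0.001)
            continue
        batch = experience_list.pop(0)       # one rollout: (action, state, done, R) tuples

        with tf.GradientTape() as tape:
            total_loss = 0.0
            for at, state, _done, R in batch:
                actor, critic = globalagent.model(state)
                # Same loss as in the previous example: policy gradient term,
                # squared value error, and an entropy bonus.
                y_predpi = tf.math.log(tf.maximum(actor[0][at - 1], 1e-6)) * (-critic[0] + R)
                y_predv = tf.square(R - critic[0])
                total_loss += -y_predpi + 0.5 * y_predv + entropymulti * tf.reduce_sum(
                    tf.math.log(tf.maximum(actor[0], 1e-6)) * tf.maximum(actor[0], 1e-6))

        grads = tape.gradient(total_loss, globalagent.model.trainable_variables)
        optimizer.apply_gradients(zip(grads, globalagent.model.trainable_variables))

        # Publish the updated weights and release the waiting workers.
        weights_list[0] = globalagent.model.get_weights()
        e.set()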