def start(GAME_NAME, MAX_EPISODE):
    env = gym.make(GAME_NAME)  # create environment
    actor = Actor(env.observation_space, env.action_space)  # create actor
    critic = Critic(env.observation_space, env.action_space)  # create critic
    reward_per_epi = []
    durations_per_epi = []
    l_A = []
    l_C = []
    MAX_EPISODE = MAX_EPISODE
    RENDER = False
    MAX_EP_STEPS = 1000
    #DISPLAY_REWARD_THRESHOLD=200

    #print ("begin.\n\n")
    for i_episode in range(MAX_EPISODE):
        s = env.reset()
        critic.reset()
        actor.reset()
        track_r = []
        for t in count():
            if RENDER: env.render()

            a = actor.choose_action(s)

            s_, r, done, info = env.step(a)
            #if done: r = -20             # Penalty if die
            track_r.append(r)

            td_error, abs_error = critic.learn(s, r, s_)  # Critic Learn
            actor.learn(s, a, td_error)  # Actor Learn

            s = s_

            #print ("... in episode (%d) step (%d)" % (i_episode+1,t))
            if is_ipython:
                display.clear_output(wait=True)
                display.display(plt.gcf())

            #env.render()

            if done or t >= MAX_EP_STEPS:  # Episode finished, record results
                ep_rs_sum = sum(track_r)
                #if 'running_reward' not in globals():
                #    running_reward = ep_rs_sum
                #else:
                #    running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
                #if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True   # rendering
                running_reward_avg = ep_rs_sum / float(max(t, 1))  # average reward per step (guard against t == 0)
                reward_per_epi.append(ep_rs_sum)
                durations_per_epi.append(t)
                l_A.append(np.mean(actor._loss_))
                l_C.append(np.mean(critic._loss_))
                #print("episode:", i_episode, "  reward:", ep_rs_sum)
                #plot(reward_per_epi, durations_per_epi, l_A, l_C)

                break

    return reward_per_epi, durations_per_epi, l_A, l_C
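
The Actor and Critic classes driven by start() are defined elsewhere in this project and are not shown here. As a rough illustration of what a critic.learn(s, r, s_) call of this shape is assumed to compute, the sketch below implements a minimal linear TD(0) critic; the class name and hyperparameters are hypothetical, not the project's API.

import numpy as np

class TDCriticSketch(object):
    """Linear state-value critic: V(s) = w . s, updated by TD(0)."""
    def __init__(self, n_features, gamma=0.9, lr=0.01):
        self.w = np.zeros(n_features)
        self.gamma = gamma
        self.lr = lr

    def learn(self, s, r, s_):
        s, s_ = np.asarray(s, dtype=float), np.asarray(s_, dtype=float)
        td_error = r + self.gamma * self.w.dot(s_) - self.w.dot(s)  # TD(0) error
        self.w += self.lr * td_error * s  # semi-gradient value update
        return td_error, abs(td_error)  # same (td_error, abs_error) pair as critic.learn above
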
def run():
    # build environment using openai gym
    env = gym.make('MountainCar-v0')
    env = env.unwrapped
    sess = tf.Session()
    # create an actor and critic
    actor = Actor(sess, n_actions=n_actions, n_features=n_features, lr=lr_actor)
    critic = Critic(sess, n_features=n_features, lr=lr_critic)
    # build the two networks
    actor.build_net()
    critic.build_net()

    sess.run(tf.global_variables_initializer())

    # tf.summary.FileWriter("",sess.graph)
    # count steps
    step = 0
    # env.render()
    for episode in range(n_episodes):
        s = env.reset()
        # comment the render() to speed up
        # env.render()
        # s returned by gym is a vector, we need to transform it into a matrix
        s = s[np.newaxis, :]
        a = actor.choose_action(s)
        while True:
            step += 1
            # a new transition
            s_, r, done, info = env.step(a)
            # in order to let s_ add one rank(matrix)
            s_ = s_[np.newaxis, :]
            a_ = actor.choose_action(s_)
            # calculate td_error
            td_error = critic.learn(s, s_)
            actor.learn(s, a, td_error)
            s = s_

            if step % 500 == 0:
                print(step, s_)

            if done:
                print('arrive')
                print(s_)
                break
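
run() relies on module-level hyperparameters that this snippet does not show (n_actions, n_features, lr_actor, lr_critic, n_episodes). For MountainCar-v0, whose observation is the 2-dimensional (position, velocity) vector and whose action space has 3 discrete actions, a plausible configuration might look like the following; the learning rates and episode count are assumptions, not the project's values.

n_features = 2      # MountainCar-v0 observation: (position, velocity)
n_actions = 3       # push left, no push, push right
lr_actor = 0.001    # assumed actor learning rate
lr_critic = 0.01    # assumed critic learning rate
n_episodes = 1000   # assumed episode budget
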
Example #3
class NetworkAC(object):
    """Actor-critic network pair sharing a single TensorFlow session."""
    def __init__(self):
        tf.reset_default_graph()
        self.sess = tf.Session()
        n_features = Config.PLAYER_DIMENSION * (Config.DEFENDER_COUNT + Config.INTRUDER_COUNT)
        self.actor = Actor(self.sess,
                           n_features=n_features,
                           lr=Config.LEARNING_RATE_START,
                           action_bound=[-math.pi, math.pi])
        self.critic = Critic(self.sess,
                             n_features=n_features,
                             lr=Config.LEARNING_RATE_START)
        self.sess.run(tf.global_variables_initializer())

    def train(self, x, a, y, r):
        td_error = self.critic.learn(x, r, y)  # td_error = r + gamma * V(y) - V(x)
        self.actor.learn(x, a, td_error)  # policy gradient: grad[log pi(a|x) * td_error]

    def predict(self, state):
        action = self.actor.choose_action(state)
        value = self.critic.predict(state)
        return action, value
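
A minimal usage sketch for NetworkAC, assuming Config, Actor and Critic are importable from this module and that both networks accept a row-vector state of shape (1, n_features); the zero-filled states and the dummy reward are placeholders for illustration only.

import numpy as np

net = NetworkAC()
n_features = Config.PLAYER_DIMENSION * (Config.DEFENDER_COUNT + Config.INTRUDER_COUNT)
x = np.zeros((1, n_features), dtype=np.float32)  # current state (placeholder)
y = np.zeros((1, n_features), dtype=np.float32)  # next state (placeholder)
action, value = net.predict(x)                   # heading in [-pi, pi] and V(x)
net.train(x, action, y, r=0.0)                   # one TD update with a dummy reward
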
Example #4
def start_p(GAME_NAME, BATCH_SIZE=32, MEMORY_CAPACITY=50000):
    env = gym.make(GAME_NAME)
    actor = Actor(env.observation_space, env.action_space)
    critic = Critic(env.observation_space, env.action_space)
    reward_per_epi = []
    durations_per_epi = []
    l_A = []
    l_C = []

    MAX_EPISODE = 200
    RENDER = False
    MAX_EP_STEPS = 1000
    DISPLAY_REWARD_THRESHOLD = 200
    BATCH_SIZE = BATCH_SIZE
    MEMORY_CAPACITY = MEMORY_CAPACITY
    replay_memory = SumTreeMemoryBuffer(MEMORY_CAPACITY)

    #print "begin.\n\n"
    for i_episode in range(MAX_EPISODE):
        s = env.reset()
        track_r = []
        critic._v_ = []  # clear the critic's per-episode buffer
        actor._loss_ = []  # clear the actor's per-episode loss buffer
        for t in count():
            if RENDER: env.render()

            a = actor.choose_action(s)

            s_, r, done, info = env.step(a)

            ##if done: r = -20    #  Penalty if die

            track_r.append(r)

            # ACER: Actor-Critic with Experience Replay
            if not done:
                transition = np.hstack((s, a, r, s_))
                replay_memory.save(transition)  # Save non-terminal transition

            #print len(replay_memory)
            #print replay_memory.data
            #print replay_memory.gettree
            if len(replay_memory) >= BATCH_SIZE:  # start learning once enough transitions are stored
                tree_idx, batch, ISWeights = replay_memory.sample(BATCH_SIZE)  # prioritized sample from memory
                s_b = np.asarray(batch[-1, 0:8])      # state
                s_b_n = np.asarray(batch[-1, 10:18])  # next state
                a_b = np.asarray(batch[-1, 8])        # action
                r_b = np.asarray(batch[-1, 9])        # reward

                # print("tree_idx:   " + str(tree_idx))
                #print(ISWeights)

                td_error, abs_error = critic.learn(s_b, r_b, s_b_n, ISWeights)  # Critic Learn
                replay_memory.batch_update(tree_idx, abs_error)  # update sampled priorities in the sum tree
                actor.learn(s_b, a_b, td_error)  # Actor Learn
                # print("td_error:     " + str(td_error))
                print("abs_error:   " + str(abs_error))

            s = s_

            # print "... in episode (%d) step (%d)" % (i_episode+1,t)
            if is_ipython:
                display.clear_output(wait=True)
                display.display(plt.gcf())
            #env.render()

            if done or t >= MAX_EP_STEPS:
                ep_rs_sum = sum(track_r)
                # if 'running_reward' not in globals():
                #     running_reward = ep_rs_sum
                # else:
                #     running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
                #  if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True  # rendering
                running_reward_avg = ep_rs_sum / float(max(t, 1))  # average reward per step (guard against t == 0)
                reward_per_epi.append(ep_rs_sum)
                durations_per_epi.append(running_reward_avg)  # record the average reward per step here
                l_A.append(np.mean(actor._loss_))
                l_C.append(np.mean(critic._loss_))
                # print("episode:", i_episode, "  reward:", ep_rs_sum)
                #plot(reward_per_epi, durations_per_epi, l_A, l_C)

                break

    return reward_per_epi, durations_per_epi, l_A, l_C
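
The hard-coded slices in start_p (batch[-1, 0:8], [8], [9] and [10:18]) match np.hstack((s, a, r, s_)) only when the observation has 8 features, as in LunarLander-v2. The hypothetical helper below derives the offsets from the observation size instead of hard-coding them; it is a sketch of the same unpacking, not part of the project.

import numpy as np

def unpack_transition(row, n_s):
    """Split a flat np.hstack((s, a, r, s_)) row back into its four parts."""
    s = row[0:n_s]                  # state
    a = row[n_s]                    # action
    r = row[n_s + 1]                # reward
    s_ = row[n_s + 2:2 * n_s + 2]   # next state
    return s, a, r, s_

# e.g. s_b, a_b, r_b, s_b_n = unpack_transition(batch[-1], env.observation_space.shape[0])
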
Example #5
def start_er(GAME_NAME, BATCH_SIZE=32, MEMORY_CAPACITY=50000):
    #print ("make enviornment")
    env = gym.make(GAME_NAME)
    #print ("create actor, critic")
    actor = Actor(env.observation_space, env.action_space)
    critic = Critic(env.observation_space, env.action_space)
    reward_per_epi = []
    durations_per_epi = []
    l_A = []
    l_C = []

    MAX_EPISODE = 500
    RENDER = False
    MAX_EP_STEPS = 1000
    DISPLAY_REWARD_THRESHOLD = 200
    BATCH_SIZE = BATCH_SIZE
    MEMORY_CAPACITY = MEMORY_CAPACITY
    replay_memory = ReplayMemory(MEMORY_CAPACITY)

    #print ("begin.\n")
    for i_episode in range(MAX_EPISODE):
        s = env.reset()
        track_r = []
        critic._v_ = []  # clear the critic's per-episode buffer
        actor._loss_ = []  # clear the actor's per-episode loss buffer
        for t in count():
            if RENDER: env.render()

            a = actor.choose_action(s)

            s_, r, done, info = env.step(a)

            ##if done: r = -20    #  Penalty if die

            track_r.append(r)

            # Actor-Critic with Experience Replay: learn from stored transitions
            if not done:
                replay_memory.save(s, a, r, s_)   # save non-terminal transition into memory

            if len(replay_memory) >= BATCH_SIZE:

                transitions = replay_memory.sample(BATCH_SIZE)   # Sample from memory for training
                batch = Transition(*zip(*transitions))

                s_b = np.asarray(batch.state)
                s_b_n = np.asarray(batch.next_state)
                a_b = np.asarray(batch.action).reshape(BATCH_SIZE, 1)
                r_b = np.asarray(batch.reward).reshape(BATCH_SIZE, 1)

                td_error, abs_error = critic.learn(s_b, r_b, s_b_n)  # Critic Learn
                actor.learn(s_b, a_b, td_error)  # Actor Learn

            s = s_

            ##print ("... in episode (%d) step (%d)" % (i_episode+1,t))
            if is_ipython:
                display.clear_output(wait=True)
                display.display(plt.gcf())
            #env.render()

            if done or t >= MAX_EP_STEPS:   # Episode finished, record results
                ep_rs_sum = sum(track_r)
                #if 'running_reward' not in globals():
                #    running_reward = ep_rs_sum
                #else:
                #    running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
                #if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True   # rendering
                running_reward_avg = ep_rs_sum / float(max(t, 1))  # average reward per step (guard against t == 0)
                reward_per_epi.append(ep_rs_sum)
                durations_per_epi.append(t)
                l_A.append(np.mean(actor._loss_))
                l_C.append(np.mean(critic._loss_))
                #print("episode:", i_episode, "  reward:", ep_rs_sum)
                #plot(reward_per_epi, durations_per_epi, l_A, l_C)

                break

    return reward_per_epi, durations_per_epi, l_A, l_C
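
start_er depends on ReplayMemory and Transition helpers defined elsewhere in the project. The sketch below shows one uniform-sampling buffer with the interface the function expects (save(s, a, r, s_), sample(n), len()); the actual project classes may differ, so the names here are only illustrative.

import random
from collections import deque, namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'reward', 'next_state'))

class ReplayMemorySketch(object):
    """Fixed-capacity buffer with uniform random sampling."""
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)  # oldest transitions are dropped first

    def save(self, s, a, r, s_):
        self.buffer.append(Transition(s, a, r, s_))

    def sample(self, batch_size):
        return random.sample(list(self.buffer), batch_size)

    def __len__(self):
        return len(self.buffer)
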
def start_er(GAME_NAME, BATCH_SIZE=32, MEMORY_CAPACITY=50000):
    #print ("make enviornment")
    env = gym.make(GAME_NAME)
    #print ("create actor, critic")
    actor = Actor(env.observation_space, env.action_space)
    critic = Critic(env.observation_space, env.action_space)
    reward_per_epi = []
    durations_per_epi = []
    l_A = []
    l_C = []

    MAX_EPISODE = 200
    RENDER = False
    MAX_EP_STEPS = 1000
    DISPLAY_REWARD_THRESHOLD = 200
    BATCH_SIZE = BATCH_SIZE
    MEMORY_CAPACITY = MEMORY_CAPACITY
    replay_memory_1 = ReplayMemory(MEMORY_CAPACITY)  # bucket for transitions with positive reward
    replay_memory_2 = ReplayMemory(MEMORY_CAPACITY)  # bucket for transitions with non-positive reward
    f_1 = BATCH_SIZE // 2  # each bucket contributes half of the batch (integer division)
    f_2 = BATCH_SIZE // 2

    #print ("begin.\n")
    for i_episode in range(MAX_EPISODE):
        s = env.reset()
        track_r = []
        critic._v_ = []
        actor._loss_ = []
        for t in count():
            if RENDER: env.render()

            a = actor.choose_action(s)

            s_, r, done, info = env.step(a)

            track_r.append(r)

            if not done:
                # save non-terminal transition into the bucket matching its reward sign
                if r > 0:
                    replay_memory_1.save(s, a, r, s_)
                else:
                    replay_memory_2.save(s, a, r, s_)

            # learn from memory once both buckets can fill their half of the batch
            if len(replay_memory_1) >= f_1 and len(replay_memory_2) >= f_2:
                transitions_1 = replay_memory_1.sample(f_1)  # sample half a batch from each bucket
                batch1 = Transition(*zip(*transitions_1))
                transitions_2 = replay_memory_2.sample(f_2)
                batch2 = Transition(*zip(*transitions_2))

                s_b = np.append(np.asarray(batch1.state),
                                np.asarray(batch2.state),
                                axis=0)
                s_b_n = np.append(np.asarray(batch1.next_state),
                                  np.asarray(batch2.next_state),
                                  axis=0)
                a_b = np.append(np.asarray(batch1.action).reshape(f_1, 1),
                                np.asarray(batch2.action).reshape(f_2, 1),
                                axis=0)
                r_b = np.append(np.asarray(batch1.reward).reshape(f_1, 1),
                                np.asarray(batch2.reward).reshape(f_2, 1),
                                axis=0)

                td_error, abs_error = critic.learn(s_b, r_b, s_b_n)  # Critic Learn
                actor.learn(s_b, a_b, td_error)  # Actor Learn

            s = s_

            ##print ("... in episode (%d) step (%d)" % (i_episode+1,t))
            if is_ipython:
                display.clear_output(wait=True)
                display.display(plt.gcf())
            #env.render()

            if done or t >= MAX_EP_STEPS:  # Episode finished, record results
                ep_rs_sum = sum(track_r)
                #if 'running_reward' not in globals():
                #    running_reward = ep_rs_sum
                #else:
                #    running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
                #if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True   # rendering
                running_reward_avg = ep_rs_sum / float(max(t, 1))  # average reward per step (guard against t == 0)
                reward_per_epi.append(ep_rs_sum)
                durations_per_epi.append(t)
                l_A.append(np.mean(actor._loss_))
                l_C.append(np.mean(critic._loss_))
                #print("episode:", i_episode, "  reward:", ep_rs_sum)
                #plot(reward_per_epi, durations_per_epi, l_A, l_C)

                break

    return reward_per_epi, durations_per_epi, l_A, l_C
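
This second start_er variant stratifies replay by reward sign: each batch is built from half positive-reward and half non-positive-reward transitions, so rare positive experiences are not drowned out. A usage sketch, assuming the Actor, Critic, ReplayMemory and Transition helpers used above are in scope; LunarLander-v2 is only an illustrative environment that emits rewards of both signs.

rewards, durations, actor_loss, critic_loss = start_er('LunarLander-v2',
                                                       BATCH_SIZE=32,
                                                       MEMORY_CAPACITY=50000)
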
Example #7
    critic = Critic(sess=sess, n_features=N_F, gamma=GAMMA, lr=LR_C)

    sess.run(tf.global_variables_initializer())

    if OUTPUT_GRAPH:
        tf.summary.FileWriter('logs/', sess.graph)

    for epoch in range(MAX_EPISODE):
        s = env.reset()
        t = 0
        track_r = []

        while True:
            if RENDER: env.render()

            a = actor.choose_action(s)

            s_, r, done, info = env.step(a)

            if done: r = -20

            track_r.append(r)

            td_error = critic.learn(s, r, s_)

            actor.learn(s, a, td_error)

            s = s_
            t += 1

            if done or t >= MAX_EP_STEPS:
                break  # episode finished
    actor.build_net()
    critic.build_net()

    sess.run(tf.global_variables_initializer())

    #tf.summary.FileWriter("",sess.graph)
    #count steps
    step = 0
    #env.render()
    for episode in range(n_episode):
        s = env.reset()
        #comment the render() to speed up
        #env.render()
        #s returned by gym is a vector, we need to transform it into a matrix
        s = s[np.newaxis, :]
        a = actor.choose_action(s)
        while True:
            step += 1
            # a new transition
            s_, r, done, info = env.step(a)
            # add a batch dimension so s_ becomes a 1 x n matrix
            s_ = s_[np.newaxis, :]
            a_ = actor.choose_action(s_)
            # calculate td_error
            td_error = critic.learn(s, s_)
            actor.learn(s, a, td_error)
            s = s_

            if step % 500 == 0:
                print(step, s_)
critic = Critic(sess, n_features=n_features, gamma=GAMMA, lr=LR_C)

sess.run(tf.global_variables_initializer())

if OUTPUT_GRAPH:
    tf.summary.FileWriter("logs/", sess.graph)

for i_episode in range(MAX_EPISODE):

    s = env.reset()
    t = 0
    track_r = []
    while True:
        if RENDER: env.render()

        a = actor.choose_action(s)
        s_, r, done, info = env.step(a)

        if done: r = -20

        track_r.append(r)

        td_error = critic.learn(s, r, s_)
        actor.learn(s, a, td_error)

        s = s_
        t += 1

        if done or t >= MAX_EP_STEPS:
            ep_rs_sum = sum(track_r)