def sample_task():
    # sample a random 2-D target position and scale it to screen coordinates
    range_pose = 0.3
    target_pose = np.random.rand(2) * range_pose + [0.5, 0.5]
    screen_size = 1000
    target_pose = target_pose * screen_size

    env = Reacher(target_pos=target_pose, render=True)
    return env, target_pose
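
A minimal usage sketch (an assumption, not part of the example): `sample_task` returns the environment together with its target, after which the environment can be reset and stepped. The action range below is hypothetical; `env.num_actions` and the `(observation, reward, done)` return convention are taken from the later examples.

import numpy as np

env, target_pose = sample_task()
s = env.reset()
a = np.random.uniform(-1.0, 1.0, size=env.num_actions)  # hypothetical action range
s_, r, done = env.step(a)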
Code example #2
    def __init__(self, wid):
        self.wid = wid
        # self.env = gym.make(GAME).unwrapped
        self.env = Reacher(render=True)
        self.ppo = GLOBAL_PPO
Code example #3
if __name__ == '__main__':
    GLOBAL_PPO = PPO()
    UPDATE_EVENT, ROLLING_EVENT = threading.Event(), threading.Event()
    UPDATE_EVENT.clear()            # no update at the start
    ROLLING_EVENT.set()             # start rolling out
    workers = [Worker(wid=i) for i in range(N_WORKER)]
    
    GLOBAL_UPDATE_COUNTER, GLOBAL_EP = 0, 0
    GLOBAL_RUNNING_R = []
    COORD = tf.train.Coordinator()
    QUEUE = queue.Queue()           # workers put collected data into this queue
    threads = []
    for worker in workers:          # worker threads
        t = threading.Thread(target=worker.work, args=())
        t.start()                   # training
        threads.append(t)
    # add a PPO updating thread
    threads.append(threading.Thread(target=GLOBAL_PPO.update))
    threads[-1].start()
    COORD.join(threads)

    # plot reward change and test
    plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R)
    plt.xlabel('Episode')
    plt.ylabel('Moving reward')
    plt.ion()
    plt.show()
    # env = gym.make('Pendulum-v0')
    env = Reacher(render=True)
    while True:
        s = env.reset()
        for t in range(300):
            # env.render()
            s = env.step(GLOBAL_PPO.choose_action(s))[0]
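
For context, here is a hedged sketch of the update loop that the main block launches as a thread; the actual `PPO.update` of this example is not shown in the excerpt, so this only illustrates the event/queue handshake the workers rely on. `S_DIM` and `A_DIM` are assumed state/action dimension constants.

def update(self):
    global GLOBAL_UPDATE_COUNTER
    while not COORD.should_stop():
        UPDATE_EVENT.wait()                                  # wait until enough data is collected
        data = [QUEUE.get() for _ in range(QUEUE.qsize())]   # drain the worker queue
        data = np.vstack(data)
        s, a, r = data[:, :S_DIM], data[:, S_DIM:S_DIM + A_DIM], data[:, -1:]
        # ... run several actor/critic gradient steps on (s, a, r) here ...
        UPDATE_EVENT.clear()                                 # updating finished
        GLOBAL_UPDATE_COUNTER = 0                            # reset the rollout counter
        ROLLING_EVENT.set()                                  # let workers collect data again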
Code example #4
class Worker(object):
    def __init__(self, wid):
        self.wid = wid
        # self.env = gym.make(GAME).unwrapped
        self.env = Reacher(render=True)
        self.ppo = GLOBAL_PPO

    def work(self):
        global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
        step_set = []
        epr_set = []
        step = 0
        while not COORD.should_stop():
            s = self.env.reset()
            step += 1
            ep_r = 0
            buffer_s, buffer_a, buffer_r = [], [], []
            for t in range(EP_LEN):
                if not ROLLING_EVENT.is_set():                  # while global PPO is updating
                    ROLLING_EVENT.wait()                        # wait until PPO is updated
                    buffer_s, buffer_a, buffer_r = [], [], []   # clear history buffer, use new policy to collect data
                a = self.ppo.choose_action(s)
                
                s_, r, done = self.env.step(a)
                # print('a: ',a)  # shape: []
                # print('s: ',s_) # shape: []
                # print('r: ',r) # shape: scalar
                # print('done: ', done)  # shape: True/False
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append((r + 8) / 8)                    # normalize reward, found to be useful
                s = s_
                ep_r += r

                GLOBAL_UPDATE_COUNTER += 1                      # count toward the minimum batch size; no need to wait for other workers
                if t == EP_LEN - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                    v_s_ = self.ppo.get_v(s_)
                    discounted_r = []                           # compute discounted reward
                    for r in buffer_r[::-1]:
                        v_s_ = r + GAMMA * v_s_
                        discounted_r.append(v_s_)
                    discounted_r.reverse()

                    bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis]
                    buffer_s, buffer_a, buffer_r = [], [], []
                    QUEUE.put(np.hstack((bs, ba, br)))          # put data in the queue
                    if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                        ROLLING_EVENT.clear()       # stop collecting data
                        UPDATE_EVENT.set()          # trigger global PPO update

                    if GLOBAL_EP >= EP_MAX:         # stop training
                        COORD.request_stop()
                        break

            # record reward changes, plot later
            if len(GLOBAL_RUNNING_R) == 0: GLOBAL_RUNNING_R.append(ep_r)
            else: GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1] * 0.9 + ep_r * 0.1)
            GLOBAL_EP += 1
            print('{0:.1f}%'.format(GLOBAL_EP / EP_MAX * 100), '|W%i' % self.wid, '|Ep_r: %.2f' % ep_r)
            step_set.append(step)
            epr_set.append(ep_r)
            if step % 500 == 0:
                plt.plot(step_set, epr_set)
                plt.savefig('./ppo.png')
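
The bootstrapped discounted-return computation inside `Worker.work` can be hard to read inline; the standalone helper below restates it for clarity (a sketch, not part of the example). `GAMMA` corresponds to the discount factor and `v_last` to the critic's value estimate of the final state.

def discounted_returns(rewards, v_last, gamma):
    returns = []
    running = v_last                    # bootstrap from the value of the last state
    for r in reversed(rewards):
        running = r + gamma * running   # R_t = r_t + gamma * R_{t+1}
        returns.append(running)
    returns.reverse()                   # restore chronological order
    return returns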
Code example #5
        self.rewards = []
        self.entropy = []

    def forward(self, x):
        x1 = self.hidden_acti(self.affine1(x))
        x2 = self.hidden_acti(self.affine2(x1))
        action_dis_mu = 360. * self.output_acti(self.affine3_mu(x2))
        # the scale controls the action exploration noise; it's a 1-dim tensor and could be made learnable, but is not here
        # scale = torch.from_numpy(np.array(self.output_size*[1.])).float()
        scale = self.output_acti_sigma(self.affine3_sigma(x2))

        return action_dis_mu, scale
        # return Normal(loc=action_dis_mu, scale=scale)


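The excerpt begins partway through `Policy.__init__`, so the layers used in `forward` are not shown. The sketch below is an assumed constructor that is merely consistent with the attribute names referenced above and with the call `Policy(env.num_observations, 100, 200, env.num_actions)`; the actual layer sizes and activations may differ.

import torch.nn as nn

class Policy(nn.Module):
    def __init__(self, input_size, hidden1, hidden2, output_size):
        super().__init__()
        self.output_size = output_size
        self.affine1 = nn.Linear(input_size, hidden1)
        self.affine2 = nn.Linear(hidden1, hidden2)
        self.affine3_mu = nn.Linear(hidden2, output_size)      # mean head
        self.affine3_sigma = nn.Linear(hidden2, output_size)   # scale (std-dev) head
        self.hidden_acti = nn.ReLU()
        self.output_acti = nn.Sigmoid()                        # mean is scaled to [0, 360] in forward()
        self.output_acti_sigma = nn.Softplus()                 # keeps the scale positive
        self.rewards = []
        self.entropy = []
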
env = Reacher(render=True)
policy = Policy(env.num_observations, 100, 200, env.num_actions)
optimizer = optim.Adam(policy.parameters(), lr=1e-3)
eps = np.finfo(np.float32).eps.item()


def select_action(state):
    state = torch.from_numpy(state).float()  # state: 2-dim tensor
    mu, scale = policy(state)
    # print('mu: ', mu)
    # print('scale: ',scale.squeeze())
    action_dis = Normal(loc=mu, scale=scale.squeeze())  # mu is 2d, scale is 1d
    # scale+=1e-6
    # action_dis =Normal(loc=mu, scale=scale)
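
The excerpt is cut off here. For completeness, a hedged sketch of how an action is usually drawn from the `Normal` distribution constructed above, with REINFORCE-style bookkeeping; none of the names below are taken from the original code.

def sample_from_policy(action_dis):
    action = action_dis.sample()                   # draw a continuous action
    log_prob = action_dis.log_prob(action).sum()   # log-probability for the policy-gradient loss
    entropy = action_dis.entropy().sum()           # entropy term (cf. the policy.entropy list above)
    return action.detach().numpy(), log_prob, entropy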