def sample_task():
    range_pose = 0.3
    target_pose = np.random.rand(2) * range_pose + [0.5, 0.5]  # uniform target in [0.5, 0.8) along each axis
    screen_size = 1000
    target_pose = target_pose * screen_size                    # scale to screen (pixel) coordinates
    env = Reacher(target_pos=target_pose, render=True)
    return env, target_pose
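A minimal usage sketch for sample_task, assuming the Reacher interface used in the worker code below (reset(), a three-value step(), and num_actions); the [-1, 1] action range is an illustrative assumption, not taken from the original:

import numpy as np

# Illustrative rollout with a random placeholder policy; not part of the original script.
env, target_pose = sample_task()
s = env.reset()
for _ in range(10):
    a = np.random.uniform(-1., 1., size=env.num_actions)  # random action in an assumed [-1, 1] range
    s, r, done = env.step(a)
    if done:
        s = env.reset()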
def __init__(self, wid):
    self.wid = wid
    # self.env = gym.make(GAME).unwrapped
    self.env = Reacher(render=True)
    self.ppo = GLOBAL_PPO
if __name__ == '__main__':
    GLOBAL_PPO = PPO()
    UPDATE_EVENT, ROLLING_EVENT = threading.Event(), threading.Event()
    UPDATE_EVENT.clear()    # no update at the start
    ROLLING_EVENT.set()     # start rolling out
    workers = [Worker(wid=i) for i in range(N_WORKER)]

    GLOBAL_UPDATE_COUNTER, GLOBAL_EP = 0, 0
    GLOBAL_RUNNING_R = []
    COORD = tf.train.Coordinator()
    QUEUE = queue.Queue()   # workers put collected data into this queue
    threads = []
    for worker in workers:  # worker (rollout) threads
        t = threading.Thread(target=worker.work, args=())
        t.start()           # start training
        threads.append(t)
    # add a PPO updating thread
    threads.append(threading.Thread(target=GLOBAL_PPO.update,))
    threads[-1].start()
    COORD.join(threads)

    # plot the reward curve, then test the trained policy
    plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R)
    plt.xlabel('Episode')
    plt.ylabel('Moving reward')
    plt.ion()
    plt.show()

    # env = gym.make('Pendulum-v0')
    env = Reacher(render=True)
    while True:
        s = env.reset()
        for t in range(300):
            # env.render()
            s = env.step(GLOBAL_PPO.choose_action(s))[0]
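These snippets rely on imports and hyperparameters defined elsewhere in the script; a representative preamble, with values chosen here purely for illustration, would be:

import threading
import queue

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

EP_MAX = 1000          # total training episodes (illustrative value)
EP_LEN = 200           # maximum steps per episode (illustrative value)
N_WORKER = 4           # number of parallel rollout workers (illustrative value)
GAMMA = 0.9            # discount factor for the return computation (illustrative value)
MIN_BATCH_SIZE = 64    # minimum samples collected before a PPO update (illustrative value)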
class Worker(object):
    def __init__(self, wid):
        self.wid = wid
        # self.env = gym.make(GAME).unwrapped
        self.env = Reacher(render=True)
        self.ppo = GLOBAL_PPO

    def work(self):
        global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
        step_set = []
        epr_set = []
        step = 0
        while not COORD.should_stop():
            s = self.env.reset()
            step += 1
            ep_r = 0
            buffer_s, buffer_a, buffer_r = [], [], []
            for t in range(EP_LEN):
                if not ROLLING_EVENT.is_set():                  # the global PPO is updating
                    ROLLING_EVENT.wait()                        # wait until the update finishes
                    buffer_s, buffer_a, buffer_r = [], [], []   # clear the history buffer; collect data with the new policy
                a = self.ppo.choose_action(s)
                s_, r, done = self.env.step(a)
                # print('a: ', a)
                # print('s: ', s_)
                # print('r: ', r)        # scalar
                # print('done: ', done)  # True/False
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append((r + 8) / 8)                    # normalize the reward; found to be useful
                s = s_
                ep_r += r

                GLOBAL_UPDATE_COUNTER += 1                      # count toward the minimum batch size; no need to wait for other workers
                if t == EP_LEN - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                    v_s_ = self.ppo.get_v(s_)
                    discounted_r = []                           # compute discounted rewards
                    for r in buffer_r[::-1]:
                        v_s_ = r + GAMMA * v_s_
                        discounted_r.append(v_s_)
                    discounted_r.reverse()

                    bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis]
                    buffer_s, buffer_a, buffer_r = [], [], []
                    QUEUE.put(np.hstack((bs, ba, br)))          # put the data in the queue
                    if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                        ROLLING_EVENT.clear()                   # stop collecting data
                        UPDATE_EVENT.set()                      # trigger the global PPO update

                    if GLOBAL_EP >= EP_MAX:                     # stop training
                        COORD.request_stop()
                        break

            # record reward changes, plot later
            if len(GLOBAL_RUNNING_R) == 0:
                GLOBAL_RUNNING_R.append(ep_r)
            else:
                GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1] * 0.9 + ep_r * 0.1)
            GLOBAL_EP += 1
            print('{0:.1f}%'.format(GLOBAL_EP / EP_MAX * 100), '|W%i' % self.wid, '|Ep_r: %.2f' % ep_r)
            step_set.append(step)
            epr_set.append(ep_r)
            if step % 500 == 0:                                 # save the learning curve every 500 episodes
                plt.plot(step_set, epr_set)
                plt.savefig('./ppo.png')
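The updating thread launched in the main block is the consumer of QUEUE and the counterpart of the ROLLING_EVENT/UPDATE_EVENT handshake above. The PPO class itself is not shown in this excerpt; the following is only a sketch of what its update loop might look like, relying on the globals defined in the main block, with S_DIM and A_DIM standing in for assumed state and action dimensions:

class PPO(object):
    # Sketch only: the real class also builds the actor/critic networks and
    # implements choose_action() and get_v(), which the workers call above.
    def update(self):
        global GLOBAL_UPDATE_COUNTER
        while not COORD.should_stop():
            UPDATE_EVENT.wait()        # block until a worker signals that enough data is ready
            data = [QUEUE.get() for _ in range(QUEUE.qsize())]  # drain all worker batches
            data = np.vstack(data)     # each row is [state, action, discounted return]
            s = data[:, :S_DIM]
            a = data[:, S_DIM:S_DIM + A_DIM]
            r = data[:, -1:]
            # ... run several epochs of the clipped-surrogate actor update and critic regression ...
            UPDATE_EVENT.clear()       # updating finished
            GLOBAL_UPDATE_COUNTER = 0  # reset the rollout counter
            ROLLING_EVENT.set()        # let workers resume data collection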
        self.rewards = []
        self.entropy = []

    def forward(self, x):
        x1 = self.hidden_acti(self.affine1(x))
        x2 = self.hidden_acti(self.affine2(x1))
        action_dis_mu = 360. * self.output_acti(self.affine3_mu(x2))  # scale to the control action range
        # exploration noise: a 1-dim tensor that should be learnable, but is not here
        # scale = torch.from_numpy(np.array(self.output_size * [1.])).float()
        scale = self.output_acti_sigma(self.affine3_sigma(x2))
        return action_dis_mu, scale
        # return Normal(loc=action_dis_mu, scale=scale)


env = Reacher(render=True)
policy = Policy(env.num_observations, 100, 200, env.num_actions)
optimizer = optim.Adam(policy.parameters(), lr=1e-3)
eps = np.finfo(np.float32).eps.item()


def select_action(state):
    state = torch.from_numpy(state).float()               # state: 2-dim tensor
    mu, scale = policy(state)
    # print('mu: ', mu)
    # print('scale: ', scale.squeeze())
    action_dis = Normal(loc=mu, scale=scale.squeeze())    # mu is 2-d, scale is 1-d
    # scale += 1e-6
    # action_dis = Normal(loc=mu, scale=scale)
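The comment that mu is 2-d while scale is 1-d relies on broadcasting in torch.distributions.Normal; a small standalone check, with illustrative values that are not part of the original script, shows the behavior:

import torch
from torch.distributions import Normal

mu = torch.tensor([90.0, 180.0])      # per-joint means, on the same 0-360 scale as forward()
scale = torch.tensor([10.0])          # a single shared standard deviation
dist = Normal(loc=mu, scale=scale)    # scale broadcasts over both action dimensions
a = dist.sample()                     # a.shape == torch.Size([2])
logp = dist.log_prob(a).sum()         # joint log-probability of the 2-d action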