Example #1
class Worker(object):
    def __init__(self, wid):
        self.wid = wid
        self.env = ArmEnv(mode=MODE[n_model])
        self.ppo = GLOBAL_PPO

    def work(self):
        global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
        while not COORD.should_stop():
            s = self.env.reset()
            ep_r = 0
            buffer_s, buffer_a, buffer_r = [], [], []
            for t in range(EP_LEN):
                if not ROLLING_EVENT.is_set():  # while global PPO is updating
                    ROLLING_EVENT.wait()  # wait until PPO is updated
                    buffer_s, buffer_a, buffer_r = [], [], []  # clear history buffer
                a = self.ppo.choose_action(s)
                s_, r, done = self.env.step(a)
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)  # store reward (no normalization applied here)
                s = s_
                ep_r += r

                GLOBAL_UPDATE_COUNTER += 1  # count to minimum batch size
                if t == EP_LEN - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                    v_s_ = self.ppo.get_v(s_)
                    discounted_r = []  # compute discounted reward
                    for r in buffer_r[::-1]:
                        v_s_ = r + GAMMA * v_s_
                        discounted_r.append(v_s_)
                    discounted_r.reverse()

                    bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis]
                    buffer_s, buffer_a, buffer_r = [], [], []
                    QUEUE.put(np.hstack((bs, ba, br)))
                    if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                        ROLLING_EVENT.clear()  # stop collecting data
                        UPDATE_EVENT.set()  # trigger global PPO update

                    if GLOBAL_EP >= EP_MAX:  # stop training
                        COORD.request_stop()
                        break

            # record reward changes, plot later
            if len(GLOBAL_RUNNING_R) == 0:
                GLOBAL_RUNNING_R.append(ep_r)
            else:
                GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1] * 0.9 +
                                        ep_r * 0.1)
            GLOBAL_EP += 1
            print(
                '{0:.1f}%'.format(GLOBAL_EP / EP_MAX * 100),
                '|W%i' % self.wid,
                '|Ep_r: %.2f' % ep_r,
            )
Example #2
class Worker(object):
    def __init__(self, wid):
        self.wid = wid
        self.env = ArmEnv(mode=MODE[n_model])
        self.ppo = GLOBAL_PPO

    def work(self):
        global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
        while not COORD.should_stop():
            s = self.env.reset()
            ep_r = 0
            buffer_s, buffer_a, buffer_r = [], [], []
            for t in range(EP_LEN):
                if not ROLLING_EVENT.is_set():                  # while global PPO is updating
                    ROLLING_EVENT.wait()                        # wait until PPO is updated
                    buffer_s, buffer_a, buffer_r = [], [], []   # clear history buffer
                a = self.ppo.choose_action(s)
                s_, r, done = self.env.step(a)
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)                    # store reward (no normalization applied here)
                s = s_
                ep_r += r

                GLOBAL_UPDATE_COUNTER += 1                      # count to minimum batch size
                if t == EP_LEN - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                    v_s_ = self.ppo.get_v(s_)
                    discounted_r = []                           # compute discounted reward
                    for r in buffer_r[::-1]:
                        v_s_ = r + GAMMA * v_s_
                        discounted_r.append(v_s_)
                    discounted_r.reverse()

                    bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis]
                    buffer_s, buffer_a, buffer_r = [], [], []
                    QUEUE.put(np.hstack((bs, ba, br)))
                    if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                        ROLLING_EVENT.clear()       # stop collecting data
                        UPDATE_EVENT.set()          # trigger global PPO update

                    if GLOBAL_EP >= EP_MAX:         # stop training
                        COORD.request_stop()
                        break

            # record reward changes, plot later
            if len(GLOBAL_RUNNING_R) == 0: GLOBAL_RUNNING_R.append(ep_r)
            else: GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1]*0.9+ep_r*0.1)
            GLOBAL_EP += 1
            print('{0:.1f}%'.format(GLOBAL_EP/EP_MAX*100), '|W%i' % self.wid,  '|Ep_r: %.2f' % ep_r,)
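
The wait/clear/set calls in the workers above implement a handshake with a single global updater thread that is not shown in these excerpts. Below is a self-contained sketch (hypothetical names; one worker; dummy string batches in place of np.hstack((bs, ba, br))) of that Event/Queue protocol; the real updater would drain QUEUE and run the PPO update step there.

import threading
import queue

UPDATE_EVENT, ROLLING_EVENT = threading.Event(), threading.Event()
ROLLING_EVENT.set()                 # data collection is allowed at the start
QUEUE = queue.Queue()
N_BATCHES = 3

def worker():
    for i in range(N_BATCHES):
        ROLLING_EVENT.wait()        # block while the updater is running
        QUEUE.put('batch-%d' % i)   # stands in for a stacked rollout batch
        ROLLING_EVENT.clear()       # stop collecting
        UPDATE_EVENT.set()          # hand control to the updater

def updater():
    for _ in range(N_BATCHES):
        UPDATE_EVENT.wait()         # wait until a batch is ready
        while not QUEUE.empty():
            batch = QUEUE.get()     # the real updater would train on this batch
        UPDATE_EVENT.clear()
        ROLLING_EVENT.set()         # let the worker collect again

threads = [threading.Thread(target=worker), threading.Thread(target=updater)]
for t in threads:
    t.start()
for t in threads:
    t.join()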
Example #3
def run(args):
    state_dim = 5
    action_dim = 6
    train_dir = create_dir('./result')
    agent = DDPG(state_dim, action_dim, train_dir=train_dir, gamma=args.gamma)
    agent.explore_noise.theta = 1.0
    agent.explore_noise.sigma = 2.0
    env = ArmEnv(image_shape=agent.image_size, max_move_step=args.tmax, gamma=args.gamma)

    t_train, t_test = 0, 0
    experiment = Experiment(env, agent, args.tmax)
    while True:
        # test
        T = t_test
        R = []
        while t_test - T < args.test:
            r, t = experiment.run_episode(test=True)
            R.append(r)
            t_test += t
        if len(R) > 0:
            avr = sum(R) / len(R)
            logger.info('Average test return\t{} after {} timesteps of training.'.format(avr, t_train))
        # train
        T = t_train
        R = []
        while t_train - T < args.train:
            r, t = experiment.run_episode(test=False)
            R.append(r)
            t_train += t
        if len(R) > 0:
            avr = sum(R) / len(R)
            logger.info('Average train return\t{} after {} timesteps of training'.format(avr, t_train))
Example #4
def play():
    print('Playing...')
    env = ArmEnv()
    ppo = PPO()
    ppo.load('unity-arm')
    s = env.reset()
    while True:
        a = ppo.choose_action(s)
        s, r, done = env.step(a)
        env.render()
Example #5
 def __init__(self, name, globalAC):
     self.env = ArmEnv(mode=MODE[n_model])
     self.name = name
     self.AC = ACNet(name, globalAC)
Example #6
class Worker(object):
    def __init__(self, name, globalAC):
        self.env = ArmEnv(mode=MODE[n_model])
        self.name = name
        self.AC = ACNet(name, globalAC)

    def work(self):
        global GLOBAL_RUNNING_R, GLOBAL_EP
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            s = self.env.reset()
            ep_r = 0
            for ep_t in range(MAX_EP_STEP):
                if self.name == 'W_0':
                    self.env.render()
                a = self.AC.choose_action(s)
                s_, r, done = self.env.step(a)
                if ep_t == MAX_EP_STEP - 1: done = True
                ep_r += r
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)

                if total_step % UPDATE_GLOBAL_ITER == 0 or done:  # update global and assign to local net
                    if done:
                        v_s_ = 0  # terminal
                    else:
                        v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0]
                    buffer_v_target = []
                    for r in buffer_r[::-1]:  # reverse buffer r
                        v_s_ = r + GAMMA * v_s_
                        buffer_v_target.append(v_s_)
                    buffer_v_target.reverse()

                    buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(buffer_v_target)
                    feed_dict = {
                        self.AC.s: buffer_s,
                        self.AC.a_his: buffer_a,
                        self.AC.v_target: buffer_v_target,
                    }
                    test = self.AC.update_global(feed_dict)
                    buffer_s, buffer_a, buffer_r = [], [], []
                    self.AC.pull_global()

                s = s_
                total_step += 1
                if done:
                    if len(GLOBAL_RUNNING_R) == 0:  # record running episode reward
                        GLOBAL_RUNNING_R.append(ep_r)
                    else:
                        GLOBAL_RUNNING_R.append(0.9 * GLOBAL_RUNNING_R[-1] + 0.1 * ep_r)
                    print(
                        self.name,
                        "Ep:", GLOBAL_EP,
                        "| Ep_r: %i" % GLOBAL_RUNNING_R[-1],
                        '| Var:', test,
                    )
                    GLOBAL_EP += 1
                    break
Example #7
 def __init__(self, wid):
     self.wid = wid
     self.env = ArmEnv(mode=MODE[n_model])
     self.ppo = GLOBAL_PPO
Example #8
GAMMA = 0.9  # reward discount
REPLACE_ITER_A = 500
REPLACE_ITER_C = 500
MEMORY_CAPACITY = 10000
BATCH_SIZE = 64
TAU = 0.001  # soft replacement
VAR_MIN = 0.01
RENDER = False
LOAD = False
MODE = ['easy', 'hard']
SPARSE = True
n_model = 1
use_her = True
K = 4

env = ArmEnv(mode=MODE[n_model], sparse=SPARSE)
STATE_DIM = env.state_dim

ACTION_DIM = env.action_dim
ACTION_BOUND = env.action_bound


class Episode_experience():
    def __init__(self):
        self.memory = []

    def add(self, state, action, reward, next_state, done, goal):
        self.memory += [(state, action, reward, next_state, done, goal)]

    def clear(self):
        self.memory = []
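
Since use_her and K appear above, here is a hypothetical sketch of how an Episode_experience buffer could feed HER-style goal relabeling (the "future" strategy with K substitute goals). The helpers reward_fn and state_to_goal, and the assumption that a goal can be read off a stored next_state, are not part of this snippet.

import random

def her_relabel(episode, K, reward_fn, state_to_goal):
    relabeled = Episode_experience()
    T = len(episode.memory)
    for t, (s, a, r, s_, d, g) in enumerate(episode.memory):
        relabeled.add(s, a, r, s_, d, g)                      # keep the original goal
        for _ in range(K):                                    # plus K relabeled copies
            future = random.randint(t, T - 1)                 # pick a later step of the episode
            new_g = state_to_goal(episode.memory[future][3])  # its next_state becomes the goal
            new_r = reward_fn(s_, new_g)                      # recompute reward for the new goal
            relabeled.add(s, a, new_r, s_, d, new_g)
    return relabeled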
Example #9
from arm_env import ArmEnv


EP_MAX = 2000
EP_LEN = 300
N_WORKER = 4                # parallel workers
GAMMA = 0.9                 # reward discount factor
A_LR = 0.0001               # learning rate for actor
C_LR = 0.0005                # learning rate for critic
MIN_BATCH_SIZE = 64         # minimum batch size for updating PPO
UPDATE_STEP = 5             # loop update operation n-steps
EPSILON = 0.2               # Clipped surrogate objective
MODE = ['easy', 'hard']
n_model = 1

env = ArmEnv(mode=MODE[n_model])
S_DIM = env.state_dim
A_DIM = env.action_dim
A_BOUND = env.action_bound[1]


class PPO(object):
    def __init__(self):
        self.sess = tf.compat.v1.Session()

        self.tfs = tf.compat.v1.placeholder(tf.float32, [None, S_DIM], 'state')

        # critic
        l1 = tf.compat.v1.layers.dense(self.tfs, 100, tf.nn.relu)
        self.v = tf.compat.v1.layers.dense(l1, 1)
        self.tfdc_r = tf.compat.v1.placeholder(tf.float32, [None, 1], 'discounted_r')
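        # The excerpt stops here; a critic head like this is usually trained by
        # regressing self.v onto the discounted return. The three lines below are a
        # hedged guess at that omitted part (they only use names defined above plus
        # C_LR), not code taken from the original file.
        self.advantage = self.tfdc_r - self.v
        self.closs = tf.reduce_mean(tf.square(self.advantage))
        self.ctrain_op = tf.compat.v1.train.AdamOptimizer(C_LR).minimize(self.closs)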
Example #10
MAX_EPISODES = 1000
MAX_EP_STEPS = 500
LR_A = 1e-5  # learning rate for actor
LR_C = 1e-5  # learning rate for critic
GAMMA = 0.9  # reward discount
REPLACE_ITER_A = 1100
REPLACE_ITER_C = 1000
MEMORY_CAPACITY = 5000
BATCH_SIZE = 32
VAR_MIN = 0.1
RENDER = True
LOAD = False
MODE = ['easy', 'hard']
n_model = 1

env = ArmEnv()
# STATE_DIM = env.state_dim
# ACTION_DIM = env.action_dim
# ACTION_BOUND = env.action_bound

STATE_DIM = env.s.shape[0]
ACTION_DIM = env.joint.shape[0]

ACTION_BOUND = [-20, 20]

# all placeholder for tf
with tf.name_scope('S'):
    S = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s')
with tf.name_scope('R'):
    R = tf.placeholder(tf.float32, [None, 1], name='r')
with tf.name_scope('S_'):
    S_ = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s_')
Example #11
from arm_env import ArmEnv

MODE = ['easy', 'hard']
n_model = 1

env = ArmEnv(mode=MODE[n_model])
S_DIM = env.state_dim
A_DIM = env.action_dim
A_BOUND = env.action_bound[1]

print(S_DIM)
print(A_DIM)
print(A_BOUND)
s = env.reset()
print(s)
a = env.sample_action()
print(a)
r = env.step(a)  # env.step returns the full (next_state, reward, done) tuple
print(r)
Example #12
def train(config={}, use_unity_arm=False):
    tf.reset_default_graph()

    should_random_target = 'should_random_target' in config.keys() and config['should_random_target']

    env = (UnityArmEnv() if use_unity_arm
           else ArmEnv(mode='easy', should_random_target=should_random_target))
    config['A_DIM'] = env.action_dim
    config['S_DIM'] = env.state_dim
    ppo = PPO(config)
    all_ep_r = []
    lambdas = []

    should_render = 'should_render' in config.keys() and config['should_render']

    start = time.perf_counter()  # time.clock() was removed in Python 3.8

    plot = Plot()

    for ep in range(ppo.EP_MAX):
        s = env.reset()
        buffer_s, buffer_a, buffer_r = [], [], []
        ep_r = 0
        for t in tqdm(range(ppo.EP_LEN), desc='Training EP-{}/{}: '.format(ep, ppo.EP_MAX)):  # in one episode
            if should_render:
                env.render()
            a = ppo.choose_action(s)
            s_, r, done = env.step(a)
            buffer_s.append(s)
            buffer_a.append(a)
            buffer_r.append(r)  # store reward (no normalization applied here)
            s = s_
            ep_r += r

            # update ppo
            if (t + 1) % ppo.BATCH == 0 or t == ppo.EP_LEN - 1:
                v_s_ = ppo.get_v(s_)
                discounted_r = []
                for r in buffer_r[::-1]:
                    v_s_ = r + ppo.GAMMA * v_s_
                    discounted_r.append(v_s_)
                discounted_r.reverse()

                bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis]
                buffer_s, buffer_a, buffer_r = [], [], []
                # bs = (bs - bs.mean()) / (bs.std() + 1e-6)
                # br = (br - br.mean()) / (br.std() + 1e-6)
                ppo.update(bs, ba, br)
        if ep == 0:
            all_ep_r.append(ep_r)
        else:
            all_ep_r.append(all_ep_r[-1] * 0.9 + ep_r * 0.1)

        print('Current Reward: ', ep_r)
        plot.update(all_ep_r)

        if ppo.METHOD['name'] == 'kl_pen':
            lambdas.append(ppo.METHOD['lam'])

    elapsed = time.perf_counter() - start

    print('Train with method {} done!'.format(ppo.METHOD['name']))
    print('Time elapsed {}s'.format(elapsed))

    return {
        'method': ppo.METHOD['name'],
        'ep_r': all_ep_r,
        'lambda': lambdas,
        'time': elapsed,  # elapsed wall-clock time
        'config': config,  # config used for this run
    }, ppo, env
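
A hypothetical call of train() above; only should_random_target and should_render are read from config in this excerpt, and the returned dict exposes at least the keys built in the return statement.

result, ppo, env = train(config={'should_random_target': True,
                                 'should_render': False})
print(result['method'], result['time'])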
Example #13
 def __init__(self, name, globalAC):
     self.env = ArmEnv(mode=MODE[n_model])
     self.name = name
     self.AC = ACNet(name, globalAC)
Example #14
class Worker(object):
    def __init__(self, name, globalAC):
        self.env = ArmEnv(mode=MODE[n_model])
        self.name = name
        self.AC = ACNet(name, globalAC)

    def work(self):
        global GLOBAL_RUNNING_R, GLOBAL_EP
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            s = self.env.reset()
            ep_r = 0
            for ep_t in range(MAX_EP_STEP):
                if self.name == 'W_0':
                    self.env.render()
                a = self.AC.choose_action(s)
                s_, r, done = self.env.step(a)
                if ep_t == MAX_EP_STEP - 1: done = True
                ep_r += r
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)

                if total_step % UPDATE_GLOBAL_ITER == 0 or done:   # update global and assign to local net
                    if done:
                        v_s_ = 0   # terminal
                    else:
                        v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0]
                    buffer_v_target = []
                    for r in buffer_r[::-1]:    # reverse buffer r
                        v_s_ = r + GAMMA * v_s_
                        buffer_v_target.append(v_s_)
                    buffer_v_target.reverse()

                    buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(buffer_v_target)
                    feed_dict = {
                        self.AC.s: buffer_s,
                        self.AC.a_his: buffer_a,
                        self.AC.v_target: buffer_v_target,
                    }
                    test = self.AC.update_global(feed_dict)
                    buffer_s, buffer_a, buffer_r = [], [], []
                    self.AC.pull_global()

                s = s_
                total_step += 1
                if done:
                    if len(GLOBAL_RUNNING_R) == 0:  # record running episode reward
                        GLOBAL_RUNNING_R.append(ep_r)
                    else:
                        GLOBAL_RUNNING_R.append(0.9 * GLOBAL_RUNNING_R[-1] + 0.1 * ep_r)
                    print(
                        self.name,
                        "Ep:", GLOBAL_EP,
                        "| Ep_r: %i" % GLOBAL_RUNNING_R[-1],
                        '| Var:', test,
                    )
                    GLOBAL_EP += 1
                    break
Example #15
def to_one_hot(a, n):  # the enclosing def was cut off in this excerpt; name and signature assumed
    return np.array([1 if a == i else 0 for i in range(n)])


def discount_and_norm_rewards(episode_rewards, gamma):
    discounted_episode_rewards = np.zeros_like(episode_rewards)
    cumulative = 0
    for t in reversed(range(len(episode_rewards))):
        cumulative = cumulative * gamma + episode_rewards[t]
        discounted_episode_rewards[t] = cumulative
    return discounted_episode_rewards
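
Despite its name, discount_and_norm_rewards above only discounts in this excerpt (there is no normalization step). A quick hand-checked call, assuming numpy is imported as np:

rewards = [1.0, 0.0, 1.0]   # use floats: np.zeros_like on an int list would truncate the returns
print(discount_and_norm_rewards(rewards, gamma=0.9))
# prints roughly [1.81 0.9  1.  ]   since 1 + 0.9 * (0 + 0.9 * 1) = 1.81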


env = ArmEnv(size_x=4,
             size_y=3,
             cubes_cnt=4,
             episode_max_length=2000,
             finish_reward=200,
             action_minus_reward=0.0,
             tower_target_size=3)

s = env.reset()
obs_len = len(s)

tf.reset_default_graph()

state = tf.placeholder('float32', shape=[None, obs_len], name="STATE")
actions = tf.squeeze(tf.placeholder('int32', name="ACTIONS"))
'''
============================
Actor = Policy Approximation
============================
Example #16
def train(config={}):
    tf.reset_default_graph()
    env = ArmEnv(mode='easy', should_random_target=True)
    ppo = PPO(config)
    all_ep_r = []
    lambdas = []

    should_render = 'should_render' in config.keys() and config['should_render']

    start = time.perf_counter()  # time.clock() was removed in Python 3.8

    for ep in tqdm(range(ppo.EP_MAX), desc='Training'):
        s = env.reset()
        buffer_s, buffer_a, buffer_r = [], [], []
        ep_r = 0
        for t in range(ppo.EP_LEN):    # in one episode
            if should_render:
                env.render()
            a = ppo.choose_action(s)
            s_, r, done = env.step(a)
            buffer_s.append(s)
            buffer_a.append(a)
            buffer_r.append((r+8)/8)    # normalize reward; found to be useful
            s = s_
            ep_r += r

            # update ppo
            if (t+1) % ppo.BATCH == 0 or t == ppo.EP_LEN-1:
                v_s_ = ppo.get_v(s_)
                discounted_r = []
                for r in buffer_r[::-1]:
                    v_s_ = r + ppo.GAMMA * v_s_
                    discounted_r.append(v_s_)
                discounted_r.reverse()

                bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis]
                buffer_s, buffer_a, buffer_r = [], [], []
                ppo.update(bs, ba, br)
        if ep == 0:
            all_ep_r.append(ep_r)
        else:
            all_ep_r.append(all_ep_r[-1]*0.9 + ep_r*0.1)
        print(
            'Ep: %i' % ep,
            "|Ep_r: %i" % ep_r,
            ("|Lam: %.4f" %
             ppo.METHOD['lam']) if ppo.METHOD['name'] == 'kl_pen' else '',
        )
        if ppo.METHOD['name'] == 'kl_pen':
            lambdas.append(ppo.METHOD['lam'])

    elapsed = time.perf_counter() - start

    print('Train with method {} done!'.format(ppo.METHOD['name']))
    print('Time elapsed {}s'.format(elapsed))

    if 'should_save' in config and config['should_save']:
        ppo.save('arm')

    return {
        'method': ppo.METHOD['name'],
        'ep_r': all_ep_r,
        'lambda': lambdas,
        'time': elapsed, # elapsed wall-clock time
        'config': config, # config used for this run
    }
Example #17
MAX_EPISODES = 600
MAX_EP_STEPS = 200
LR_A = 1e-4  # learning rate for actor
LR_C = 1e-4  # learning rate for critic
GAMMA = 0.999  # reward discount
REPLACE_ITER_A = 1100
REPLACE_ITER_C = 1000
MEMORY_CAPACITY = 10000
BATCH_SIZE = 16
VAR_MIN = 0.1
RENDER = True
LOAD = False
MODE = ['easy', 'hard']
n_model = 1

env = ArmEnv(mode=MODE[n_model])
STATE_DIM = env.state_dim
ACTION_DIM = env.action_dim
ACTION_BOUND = env.action_bound

# all placeholder for tf
with tf.name_scope('S'):
    S = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s')
with tf.name_scope('A'):
    A = tf.placeholder(tf.float32, shape=[None, ACTION_DIM], name='a')
with tf.name_scope('R'):
    R = tf.placeholder(tf.float32, [None, 1], name='r')
with tf.name_scope('S_'):
    S_ = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s_')

Example #18
from arm_env import ArmEnv


EP_MAX = 2000
EP_LEN = 300
N_WORKER = 4                # parallel workers
GAMMA = 0.9                 # reward discount factor
A_LR = 0.0001               # learning rate for actor
C_LR = 0.0005                # learning rate for critic
MIN_BATCH_SIZE = 64         # minimum batch size for updating PPO
UPDATE_STEP = 5             # loop update operation n-steps
EPSILON = 0.2               # Clipped surrogate objective
MODE = ['easy', 'hard']
n_model = 1

env = ArmEnv(mode=MODE[n_model])
S_DIM = env.state_dim
A_DIM = env.action_dim
A_BOUND = env.action_bound[1]


class PPO(object):
    def __init__(self):
        self.sess = tf.Session()

        self.tfs = tf.placeholder(tf.float32, [None, S_DIM], 'state')

        # critic
        l1 = tf.layers.dense(self.tfs, 100, tf.nn.relu)
        self.v = tf.layers.dense(l1, 1)
        self.tfdc_r = tf.placeholder(tf.float32, [None, 1], 'discounted_r')
Example #19
 def __init__(self, wid):
     self.wid = wid
     self.env = ArmEnv(mode=MODE[n_model])
     self.ppo = GLOBAL_PPO
Example #20
from arm_env import ArmEnv
import pyglet
from pyglet.window import key
import random
import cv2
import numpy as np

MODE = ['easy', 'hard']
n_model = 0

env = ArmEnv(mode=MODE[n_model])
env.set_fps(30)

env.render()
env.step([3, 3])
env.render()
while True:
    env.step([0, 0])
    env.render()
    #for j in range(100000000):
    #    continue

    #cv2.imwrite('loop'+str(t)+'.jpg',im)

    #v1 = random.randint(1, 10)
    #v2 = random.randint(1, 10)

    #env.step2([100,100])
    #env.render()

    #env.step2([100,50])