class Worker(object):
    def __init__(self, wid):
        self.wid = wid
        self.env = ArmEnv(mode=MODE[n_model])
        self.ppo = GLOBAL_PPO

    def work(self):
        global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
        while not COORD.should_stop():
            s = self.env.reset()
            ep_r = 0
            buffer_s, buffer_a, buffer_r = [], [], []
            for t in range(EP_LEN):
                if not ROLLING_EVENT.is_set():              # while global PPO is updating
                    ROLLING_EVENT.wait()                    # wait until PPO is updated
                    buffer_s, buffer_a, buffer_r = [], [], []   # clear history buffer
                a = self.ppo.choose_action(s)
                s_, r, done = self.env.step(a)
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)                          # reward is stored unnormalized here
                s = s_
                ep_r += r

                GLOBAL_UPDATE_COUNTER += 1                  # count towards the minimum batch size
                if t == EP_LEN - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                    v_s_ = self.ppo.get_v(s_)
                    discounted_r = []                       # compute discounted reward
                    for r in buffer_r[::-1]:
                        v_s_ = r + GAMMA * v_s_
                        discounted_r.append(v_s_)
                    discounted_r.reverse()

                    bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis]
                    buffer_s, buffer_a, buffer_r = [], [], []
                    QUEUE.put(np.hstack((bs, ba, br)))
                    if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                        ROLLING_EVENT.clear()               # stop collecting data
                        UPDATE_EVENT.set()                  # trigger the global PPO update

                    if GLOBAL_EP >= EP_MAX:                 # stop training
                        COORD.request_stop()
                        break

            # record reward changes, plot later
            if len(GLOBAL_RUNNING_R) == 0:
                GLOBAL_RUNNING_R.append(ep_r)
            else:
                GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1] * 0.9 + ep_r * 0.1)
            GLOBAL_EP += 1
            print('{0:.1f}%'.format(GLOBAL_EP / EP_MAX * 100), '|W%i' % self.wid, '|Ep_r: %.2f' % ep_r)
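# Hedged sketch (not from the source): the main-block wiring this Worker class expects.
# It assumes the PPO class and N_WORKER defined elsewhere in the same script, and that
# GLOBAL_PPO.update runs an update loop which waits on UPDATE_EVENT. Shown only to make
# the globals referenced above (COORD, QUEUE, ROLLING_EVENT, ...) concrete.
import threading
import queue
import tensorflow as tf

if __name__ == '__main__':
    GLOBAL_PPO = PPO()                                          # assumed PPO class from this script
    UPDATE_EVENT, ROLLING_EVENT = threading.Event(), threading.Event()
    UPDATE_EVENT.clear()                                        # not updating at start
    ROLLING_EVENT.set()                                         # start collecting data
    workers = [Worker(wid=i) for i in range(N_WORKER)]

    GLOBAL_UPDATE_COUNTER, GLOBAL_EP = 0, 0
    GLOBAL_RUNNING_R = []
    COORD = tf.train.Coordinator()
    QUEUE = queue.Queue()                                       # workers put rollouts, the updater consumes them

    threads = [threading.Thread(target=w.work) for w in workers]
    threads.append(threading.Thread(target=GLOBAL_PPO.update))  # one thread runs the PPO update loop
    for t in threads:
        t.start()
    COORD.join(threads)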
def run(args):
    state_dim = 5
    action_dim = 6
    train_dir = create_dir('./result')
    agent = DDPG(state_dim, action_dim, train_dir=train_dir, gamma=args.gamma)
    agent.explore_noise.theta = 1.0
    agent.explore_noise.sigma = 2.0
    env = ArmEnv(image_shape=agent.image_size, max_move_step=args.tmax, gamma=args.gamma)
    t_train, t_test = 0, 0
    experiment = Experiment(env, agent, args.tmax)
    while True:
        # test
        T = t_test
        R = []
        while t_test - T < args.test:
            r, t = experiment.run_episode(test=True)
            R.append(r)
            t_test += t
        if len(R) > 0:
            avr = sum(R) / len(R)
            logger.info('Average test return\t{} after {} timesteps of training.'.format(avr, t_train))
        # train
        T = t_train
        R = []
        while t_train - T < args.train:
            r, t = experiment.run_episode(test=False)
            R.append(r)
            t_train += t
        if len(R) > 0:
            avr = sum(R) / len(R)
            logger.info('Average train return\t{} after {} timesteps of training'.format(avr, t_train))
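# Hedged sketch (assumption, not from the source): a minimal argparse front-end that
# supplies the fields run() reads (gamma, tmax, test, train). The default values are
# illustrative only.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--gamma', type=float, default=0.99)   # reward discount
    parser.add_argument('--tmax', type=int, default=100)       # max move steps per episode
    parser.add_argument('--test', type=int, default=1000)      # test timesteps per cycle
    parser.add_argument('--train', type=int, default=10000)    # train timesteps per cycle
    run(parser.parse_args())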
def play():
    print('Playing...')
    env = ArmEnv()
    ppo = PPO()
    ppo.load('unity-arm')
    s = env.reset()
    while True:
        a = ppo.choose_action(s)
        s, r, done = env.step(a)
        env.render()
class Worker(object):
    def __init__(self, name, globalAC):
        self.env = ArmEnv(mode=MODE[n_model])
        self.name = name
        self.AC = ACNet(name, globalAC)

    def work(self):
        global GLOBAL_RUNNING_R, GLOBAL_EP
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            s = self.env.reset()
            ep_r = 0
            for ep_t in range(MAX_EP_STEP):
                if self.name == 'W_0':
                    self.env.render()
                a = self.AC.choose_action(s)
                s_, r, done = self.env.step(a)
                if ep_t == MAX_EP_STEP - 1:
                    done = True

                ep_r += r
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)

                if total_step % UPDATE_GLOBAL_ITER == 0 or done:   # update global and assign to local net
                    if done:
                        v_s_ = 0                                   # terminal state has zero value
                    else:
                        v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0]
                    buffer_v_target = []
                    for r in buffer_r[::-1]:                       # reverse buffer r
                        v_s_ = r + GAMMA * v_s_
                        buffer_v_target.append(v_s_)
                    buffer_v_target.reverse()

                    buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(buffer_v_target)
                    feed_dict = {
                        self.AC.s: buffer_s,
                        self.AC.a_his: buffer_a,
                        self.AC.v_target: buffer_v_target,
                    }
                    test = self.AC.update_global(feed_dict)

                    buffer_s, buffer_a, buffer_r = [], [], []
                    self.AC.pull_global()

                s = s_
                total_step += 1
                if done:
                    if len(GLOBAL_RUNNING_R) == 0:                 # record running episode reward
                        GLOBAL_RUNNING_R.append(ep_r)
                    else:
                        GLOBAL_RUNNING_R.append(0.9 * GLOBAL_RUNNING_R[-1] + 0.1 * ep_r)
                    print(
                        self.name,
                        "Ep:", GLOBAL_EP,
                        "| Ep_r: %i" % GLOBAL_RUNNING_R[-1],
                        '| Var:', test,
                    )
                    GLOBAL_EP += 1
                    break
GAMMA = 0.9                 # reward discount
REPLACE_ITER_A = 500
REPLACE_ITER_C = 500
MEMORY_CAPACITY = 10000
BATCH_SIZE = 64
TAU = 0.001                 # soft replacement
VAR_MIN = 0.01
RENDER = False
LOAD = False
MODE = ['easy', 'hard']
SPARSE = True
n_model = 1
use_her = True
K = 4

env = ArmEnv(mode=MODE[n_model], sparse=SPARSE)
STATE_DIM = env.state_dim
ACTION_DIM = env.action_dim
ACTION_BOUND = env.action_bound


class Episode_experience():
    def __init__(self):
        self.memory = []

    def add(self, state, action, reward, next_state, done, goal):
        self.memory += [(state, action, reward, next_state, done, goal)]

    def clear(self):
        self.memory = []
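# Hedged sketch (assumption, not from the source): how Episode_experience is typically
# consumed when use_her is enabled. After an episode, each transition is replayed K
# times with a future achieved state substituted as the goal ("future" strategy).
# `compute_reward` is a hypothetical helper for the sparse reward under the relabeled goal.
import random

def her_relabel(episode, K, compute_reward):
    relabeled = []
    for i, (s, a, r, s_, d, g) in enumerate(episode.memory):
        relabeled.append((s, a, r, s_, d, g))                 # keep the original transition
        for _ in range(K):
            future = random.randint(i, len(episode.memory) - 1)
            new_goal = episode.memory[future][3]              # achieved next_state becomes the new goal
            new_r = compute_reward(s_, new_goal)              # sparse reward w.r.t. the relabeled goal
            relabeled.append((s, a, new_r, s_, d, new_goal))
    return relabeled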
from arm_env import ArmEnv
import tensorflow as tf

EP_MAX = 2000
EP_LEN = 300
N_WORKER = 4                # parallel workers
GAMMA = 0.9                 # reward discount factor
A_LR = 0.0001               # learning rate for actor
C_LR = 0.0005               # learning rate for critic
MIN_BATCH_SIZE = 64         # minimum batch size for updating PPO
UPDATE_STEP = 5             # loop update operation n-steps
EPSILON = 0.2               # clipping range for the clipped surrogate objective
MODE = ['easy', 'hard']
n_model = 1

env = ArmEnv(mode=MODE[n_model])
S_DIM = env.state_dim
A_DIM = env.action_dim
A_BOUND = env.action_bound[1]


class PPO(object):
    def __init__(self):
        self.sess = tf.compat.v1.Session()
        self.tfs = tf.compat.v1.placeholder(tf.float32, [None, S_DIM], 'state')

        # critic
        l1 = tf.compat.v1.layers.dense(self.tfs, 100, tf.nn.relu)
        self.v = tf.compat.v1.layers.dense(l1, 1)
        self.tfdc_r = tf.compat.v1.placeholder(tf.float32, [None, 1], 'discounted_r')
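# Hedged sketch (assumption, not from the source): the clipped surrogate objective that
# EPSILON above parameterizes, written in plain numpy for illustration. `ratio` is
# pi_new(a|s) / pi_old(a|s) and `adv` is the estimated advantage.
import numpy as np

def clipped_surrogate_loss(ratio, adv, epsilon=0.2):
    clipped = np.clip(ratio, 1.0 - epsilon, 1.0 + epsilon) * adv
    return -np.mean(np.minimum(ratio * adv, clipped))

print(clipped_surrogate_loss(np.array([1.5, 0.5]), np.array([1.0, -1.0])))  # -> -0.2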
MAX_EPISODES = 1000
MAX_EP_STEPS = 500
LR_A = 1e-5                 # learning rate for actor
LR_C = 1e-5                 # learning rate for critic
GAMMA = 0.9                 # reward discount
REPLACE_ITER_A = 1100
REPLACE_ITER_C = 1000
MEMORY_CAPACITY = 5000
BATCH_SIZE = 32
VAR_MIN = 0.1
RENDER = True
LOAD = False
MODE = ['easy', 'hard']
n_model = 1

env = ArmEnv()
# STATE_DIM = env.state_dim
# ACTION_DIM = env.action_dim
# ACTION_BOUND = env.action_bound
STATE_DIM = env.s.shape[0]
ACTION_DIM = env.joint.shape[0]
ACTION_BOUND = [-20, 20]

# all placeholders for tf
with tf.name_scope('S'):
    S = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s')
with tf.name_scope('R'):
    R = tf.placeholder(tf.float32, [None, 1], name='r')
with tf.name_scope('S_'):
    S_ = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s_')
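# Hedged sketch (assumption, not from the source): the exploration scheme such DDPG
# hyperparameters are usually paired with. Gaussian noise is added to the actor's
# action, clipped to ACTION_BOUND, and its scale decays toward VAR_MIN over training.
# `actor_action` stands in for the (not shown) actor network's output.
import numpy as np

var = 3.0                                                   # initial exploration variance (assumed)
actor_action = np.zeros(ACTION_DIM)                         # placeholder for actor.choose_action(s)
noisy_action = np.clip(np.random.normal(actor_action, var), *ACTION_BOUND)
var = max(var * 0.9995, VAR_MIN)                            # gradually reduce exploration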
from arm_env import ArmEnv

MODE = ['easy', 'hard']
n_model = 1

env = ArmEnv(mode=MODE[n_model])
S_DIM = env.state_dim
A_DIM = env.action_dim
A_BOUND = env.action_bound[1]
print(S_DIM)
print(A_DIM)
print(A_BOUND)

s = env.reset()
print(s)
a = env.sample_action()
print(a)
r = env.step(a)
print(r)
def train(config={}, use_unity_arm=False):
    tf.reset_default_graph()
    should_random_target = 'should_random_target' in config.keys() and config['should_random_target']
    env = UnityArmEnv() if use_unity_arm else ArmEnv(mode='easy', should_random_target=should_random_target)
    config['A_DIM'] = env.action_dim
    config['S_DIM'] = env.state_dim
    ppo = PPO(config)
    all_ep_r = []
    lambdas = []
    should_render = 'should_render' in config.keys() and config['should_render']
    start = time.perf_counter()          # time.clock() was removed in Python 3.8
    plot = Plot()
    for ep in range(ppo.EP_MAX):
        s = env.reset()
        buffer_s, buffer_a, buffer_r = [], [], []
        ep_r = 0
        for t in tqdm(range(ppo.EP_LEN), desc='Training EP-' + str(ep) + '/' + str(ppo.EP_MAX) + ': '):   # in one episode
            if should_render:
                env.render()
            a = ppo.choose_action(s)
            s_, r, done = env.step(a)
            buffer_s.append(s)
            buffer_a.append(a)
            buffer_r.append(r)           # reward stored as-is here (no normalization)
            s = s_
            ep_r += r

            # update ppo
            if (t + 1) % ppo.BATCH == 0 or t == ppo.EP_LEN - 1:
                v_s_ = ppo.get_v(s_)
                discounted_r = []
                for r in buffer_r[::-1]:
                    v_s_ = r + ppo.GAMMA * v_s_
                    discounted_r.append(v_s_)
                discounted_r.reverse()
                bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis]
                buffer_s, buffer_a, buffer_r = [], [], []
                # bs = (bs - bs.mean()) / (bs.std() + 1e-6)
                # br = (br - br.mean()) / (br.std() + 1e-6)
                ppo.update(bs, ba, br)

        if ep == 0:
            all_ep_r.append(ep_r)
        else:
            all_ep_r.append(all_ep_r[-1] * 0.9 + ep_r * 0.1)
        print('Current Reward: ', ep_r)
        plot.update(all_ep_r)
        if ppo.METHOD['name'] == 'kl_pen':
            lambdas.append(ppo.METHOD['lam'])

    elapsed = time.perf_counter() - start
    print('Train with method {} done!'.format(ppo.METHOD['name']))
    print('Time elapsed {}s'.format(elapsed))
    return {
        'method': ppo.METHOD['name'],
        'ep_r': all_ep_r,
        'lambda': lambdas,
        'time': elapsed,     # elapsed wall-clock time
        'config': config,    # the configuration used for this run
    }, ppo, env
    return np.array([1 if a == i else 0 for i in range(n)])


def discount_and_norm_rewards(episode_rewards, gamma):
    # note: despite the name, this only discounts; no normalization is applied
    discounted_episode_rewards = np.zeros_like(episode_rewards, dtype=np.float64)  # float accumulator
    cumulative = 0
    for t in reversed(range(len(episode_rewards))):
        cumulative = cumulative * gamma + episode_rewards[t]
        discounted_episode_rewards[t] = cumulative
    return discounted_episode_rewards


env = ArmEnv(size_x=4,
             size_y=3,
             cubes_cnt=4,
             episode_max_length=2000,
             finish_reward=200,
             action_minus_reward=0.0,
             tower_target_size=3)

s = env.reset()
obs_len = len(s)

tf.reset_default_graph()

state = tf.placeholder('float32', shape=[None, obs_len], name="STATE")
actions = tf.squeeze(tf.placeholder('int32', name="ACTIONS"))

'''
============================
Actor = Policy Approximation
============================
'''
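# Hedged usage example (not from the source): discounting a short reward sequence with
# gamma = 0.9 using the function above. The last reward is kept as-is; earlier entries
# accumulate the discounted tail.
rewards = [0.0, 0.0, 1.0]
print(discount_and_norm_rewards(rewards, gamma=0.9))   # -> [0.81, 0.9, 1.0]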
def train(config={}):
    tf.reset_default_graph()
    env = ArmEnv(mode='easy', should_random_target=True)
    ppo = PPO(config)
    all_ep_r = []
    lambdas = []
    should_render = 'should_render' in config.keys() and config['should_render']
    start = time.perf_counter()          # time.clock() was removed in Python 3.8
    for ep in tqdm(range(ppo.EP_MAX), desc='Training'):
        s = env.reset()
        buffer_s, buffer_a, buffer_r = [], [], []
        ep_r = 0
        for t in range(ppo.EP_LEN):      # in one episode
            if should_render:
                env.render()
            a = ppo.choose_action(s)
            s_, r, done = env.step(a)
            buffer_s.append(s)
            buffer_a.append(a)
            buffer_r.append((r + 8) / 8)   # normalize reward; found to be useful
            s = s_
            ep_r += r

            # update ppo
            if (t + 1) % ppo.BATCH == 0 or t == ppo.EP_LEN - 1:
                v_s_ = ppo.get_v(s_)
                discounted_r = []
                for r in buffer_r[::-1]:
                    v_s_ = r + ppo.GAMMA * v_s_
                    discounted_r.append(v_s_)
                discounted_r.reverse()
                bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis]
                buffer_s, buffer_a, buffer_r = [], [], []
                ppo.update(bs, ba, br)

        if ep == 0:
            all_ep_r.append(ep_r)
        else:
            all_ep_r.append(all_ep_r[-1] * 0.9 + ep_r * 0.1)
        print(
            'Ep: %i' % ep,
            "|Ep_r: %i" % ep_r,
            ("|Lam: %.4f" % ppo.METHOD['lam']) if ppo.METHOD['name'] == 'kl_pen' else '',
        )
        if ppo.METHOD['name'] == 'kl_pen':
            lambdas.append(ppo.METHOD['lam'])

    elapsed = time.perf_counter() - start
    print('Train with method {} done!'.format(ppo.METHOD['name']))
    print('Time elapsed {}s'.format(elapsed))
    if 'should_save' in config and config['should_save']:
        ppo.save('arm')
    return {
        'method': ppo.METHOD['name'],
        'ep_r': all_ep_r,
        'lambda': lambdas,
        'time': elapsed,     # elapsed wall-clock time
        'config': config,    # the configuration used for this run
    }
MAX_EPISODES = 600
MAX_EP_STEPS = 200
LR_A = 1e-4                 # learning rate for actor
LR_C = 1e-4                 # learning rate for critic
GAMMA = 0.999               # reward discount
REPLACE_ITER_A = 1100
REPLACE_ITER_C = 1000
MEMORY_CAPACITY = 10000
BATCH_SIZE = 16
VAR_MIN = 0.1
RENDER = True
LOAD = False
MODE = ['easy', 'hard']
n_model = 1

env = ArmEnv(mode=MODE[n_model])
STATE_DIM = env.state_dim
ACTION_DIM = env.action_dim
ACTION_BOUND = env.action_bound

# all placeholders for tf
with tf.name_scope('S'):
    S = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s')
with tf.name_scope('A'):
    A = tf.placeholder(tf.float32, shape=[None, ACTION_DIM], name='a')
with tf.name_scope('R'):
    R = tf.placeholder(tf.float32, [None, 1], name='r')
with tf.name_scope('S_'):
    S_ = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s_')
from arm_env import ArmEnv
import tensorflow as tf

EP_MAX = 2000
EP_LEN = 300
N_WORKER = 4                # parallel workers
GAMMA = 0.9                 # reward discount factor
A_LR = 0.0001               # learning rate for actor
C_LR = 0.0005               # learning rate for critic
MIN_BATCH_SIZE = 64         # minimum batch size for updating PPO
UPDATE_STEP = 5             # loop update operation n-steps
EPSILON = 0.2               # clipping range for the clipped surrogate objective
MODE = ['easy', 'hard']
n_model = 1

env = ArmEnv(mode=MODE[n_model])
S_DIM = env.state_dim
A_DIM = env.action_dim
A_BOUND = env.action_bound[1]


class PPO(object):
    def __init__(self):
        self.sess = tf.Session()
        self.tfs = tf.placeholder(tf.float32, [None, S_DIM], 'state')

        # critic
        l1 = tf.layers.dense(self.tfs, 100, tf.nn.relu)
        self.v = tf.layers.dense(l1, 1)
        self.tfdc_r = tf.placeholder(tf.float32, [None, 1], 'discounted_r')
from arm_env import ArmEnv
import pyglet
from pyglet.window import key
import random
import cv2
import numpy as np

MODE = ['easy', 'hard']
n_model = 0

env = ArmEnv(mode=MODE[n_model])
env.set_fps(30)
env.render()
env.step([3, 3])
env.render()

while True:
    env.step([0, 0])
    env.render()
    # for j in range(100000000):
    #     continue
    # cv2.imwrite('loop' + str(t) + '.jpg', im)
    # v1 = random.randint(1, 10)
    # v2 = random.randint(1, 10)
    # env.step2([100, 100])
    # env.render()
    # env.step2([100, 50])