def train(env_id, num_timesteps, seed):
    """Run ``ppo_gail.learn`` on a ``JacoEnv`` with an expert-demo reward giver.

    Args:
        env_id: Not used by this function; the environment is always a
            ``JacoEnv`` (the ``make_mujoco_env(env_id, seed)`` call is disabled).
        num_timesteps: Forwarded to ``ppo_gail.learn`` as ``max_timesteps``.
        seed: Not used by this function (see ``env_id``).
    """
    # Lazy import kept exactly as before; the unused names stay because this
    # block cannot tell whether importing them has side effects.
    from baselines.ppo_pnp import mlp_policy, pposgd_simple, interactive_ppo, ppo_gail

    # The session is entered here and never exited inside this function.
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=3)

    env = JacoEnv(64, 64, 1, 1.0)  # replaces make_mujoco_env(env_id, seed)
    try:
        dataset = Mujoco_Dset(expert_path='data/pnp_demo.npz', traj_limitation=-1)
        reward_giver = TransitionClassifier(env, 100, entcoeff=1e-3)
        ppo_gail.learn(
            env,
            policy_fn,
            reward_giver,
            dataset,
            max_timesteps=num_timesteps,
            timesteps_per_actorbatch=2048,
            clip_param=0.2,
            entcoeff=0.0,
            optim_epochs=10,
            optim_stepsize=3e-4,
            optim_batchsize=64,
            gamma=0.99,
            lam=0.95,
            schedule='linear',
        )
    finally:
        # Close the environment even when loading the demos or training fails.
        env.close()
def __init__(self, wid):
    """Create a worker with its own ``JacoEnv`` and the shared ``GLOBAL_PPO``.

    Args:
        wid: Worker id. The worker whose id is 0 additionally gets a
            ``mujoco_py.MjViewer`` attached to its environment's simulation.
    """
    self.wid = wid
    # JacoEnv is used in place of the former gym.make(GAME).unwrapped.
    jaco_env = JacoEnv(64, 64, 100)
    self.env = jaco_env
    self.ppo = GLOBAL_PPO
    if self.wid == 0:
        self.viewer = mujoco_py.MjViewer(jaco_env.sim)
def __init__(self, render=False, eps_start=EPS_START, eps_end=EPS_STOP, eps_steps=EPS_STEPS):
    """Initialise the thread, its ``JacoEnv`` and its ``Agent``.

    Args:
        render: Stored on ``self.render``.
        eps_start, eps_end, eps_steps: Forwarded unchanged to ``Agent``.
    """
    threading.Thread.__init__(self)
    # The environment is built before the agent, as before.
    jaco_env = JacoEnv(64, 64, 100, 0.1, 0.8, True)
    agent = Agent(eps_start, eps_end, eps_steps)
    self.render, self.env, self.agent = render, jaco_env, agent
def main(args):
    """Fit a policy with ``learn`` on expert data, then roll it out with ``runner``.

    ``args.checkpoint_dir`` and ``args.log_dir`` are rewritten in place to
    include the task name returned by ``get_task_name(args)``.
    """
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)

    # JacoEnv stands in for gym.make(args.env_id); the bench.Monitor wrapper
    # (monitor.json under logger.get_dir()) is disabled.
    env = JacoEnv()

    def policy_fn(name, ob_space, ac_space, reuse=False):
        policy_kwargs = dict(
            name=name,
            ob_space=ob_space,
            ac_space=ac_space,
            reuse=reuse,
            hid_size=args.policy_hidden_size,
            num_hid_layers=2,
        )
        return mlp_policy.MlpPolicy(**policy_kwargs)

    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)

    run_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, run_name)
    args.log_dir = osp.join(args.log_dir, run_name)

    expert_data = Mujoco_Dset(expert_path=args.expert_path,
                              traj_limitation=args.traj_limitation)
    checkpoint_path = learn(
        env,
        policy_fn,
        expert_data,
        max_iters=args.BC_max_iter,
        ckpt_dir=args.checkpoint_dir,
        log_dir=args.log_dir,
        task_name=run_name,
        verbose=True,
    )
    # The two results are unpacked (as before) but not used afterwards.
    mean_len, mean_ret = runner(
        env,
        policy_fn,
        checkpoint_path,
        timesteps_per_batch=1024,
        number_trajs=10,
        stochastic_policy=args.stochastic_policy,
        save=args.save_sample,
        reuse=True,
    )
class Environment(threading.Thread):
    """Thread that keeps running ``JacoEnv`` episodes with its own ``Agent``."""

    # Class-level default; ``stop()`` shadows it with an instance attribute.
    stop_signal = False

    def __init__(self, render=False, eps_start=EPS_START, eps_end=EPS_STOP, eps_steps=EPS_STEPS):
        """Build the environment and the agent; ``eps_*`` go straight to ``Agent``."""
        super().__init__()
        self.render = render
        self.env = JacoEnv(64, 64, 100, 0.1, 0.8, True)
        self.agent = Agent(eps_start, eps_end, eps_steps)

    def runEpisode(self):
        """Play one episode, training the agent on every step, then print the return."""
        state = self.env.reset()
        total_reward = 0

        while True:
            time.sleep(THREAD_DELAY)  # yield

            if self.render:
                self.env.render()

            action = self.agent.act(state)
            next_state, reward, done, _info = self.env.step(action)

            # A terminal transition is handed to the agent with next state None.
            next_state = None if done else next_state
            self.agent.train(state, action, reward, next_state)

            state = next_state
            total_reward += reward

            if done or self.stop_signal:
                break

        print("Total R:", total_reward)

    def run(self):
        """Thread body: run episodes until ``stop()`` has been called."""
        while not self.stop_signal:
            self.runEpisode()

    def stop(self):
        """Ask the thread to finish after the current step."""
        self.stop_signal = True
class JacoEnvRandomAgent():
    """Drive a ``JacoEnv`` forever with uniformly random actions."""

    def __init__(self, width, height, frame_skip, rewarding_distance, control_magnitude, reward_continuous, render):
        """Create the environment from the given settings and remember ``render``."""
        self.render = render
        self.env = JacoEnv(width, height, frame_skip, rewarding_distance,
                           control_magnitude, reward_continuous)

    def run(self):
        """Loop over episodes without end, displaying every non-terminal step.

        With ``self.render`` set, a ``mujoco_py.MjViewer`` is used; otherwise
        the third element of the observation is drawn with matplotlib.
        """
        _, _, first_view2 = self.env.reset()
        if self.render:
            viewer = mujoco_py.MjViewer(self.env.sim)
        else:
            f, ax = plt.subplots()
            im = ax.imshow(first_view2)

        action_values = [0, 1, 2, 3, 4]
        while True:
            self.env.reset()
            while True:
                # Draw six independent values, one call per step.
                action = np.random.choice(action_values, 6)

                # Step returns (joint obs, rgb view 1, rgb view 2), reward, done.
                observation, reward, done = self.env.step(action)
                obs_joint, obs_rgb_view1, obs_rgb_view2 = observation

                if done:
                    break

                if self.render:
                    viewer.render()
                    continue

                im.set_data(obs_rgb_view2)
                plt.draw()
                plt.pause(0.1)
class Worker(object):
    """Rollout worker: steps its own ``JacoEnv`` and feeds batches to ``QUEUE``.

    Relies on module-level names not defined in this class: ``GLOBAL_PPO``,
    ``COORD``, ``QUEUE``, ``ROLLING_EVENT``, ``UPDATE_EVENT``, ``EP_LEN``,
    ``EP_MAX``, ``MIN_BATCH_SIZE``, ``GAMMA`` and the ``GLOBAL_*`` counters.
    """

    def __init__(self, wid):
        """Create the environment; worker 0 also gets a MuJoCo viewer."""
        self.wid = wid
        #self.env = gym.make(GAME).unwrapped
        self.env = JacoEnv(64, 64, 100)
        self.ppo = GLOBAL_PPO
        if self.wid == 0:
            self.viewer = mujoco_py.MjViewer(self.env.sim)

    def work(self):
        """Collect episodes until ``COORD`` requests a stop.

        Each flushed batch is one array whose rows are
        ``hstack(state, action, discounted return)``.
        """
        # NOTE(review): these globals are read-modified-written here without a
        # lock; if several workers run this concurrently, confirm that is OK.
        global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
        while not COORD.should_stop():
            s = self.env.reset()
            ep_r = 0
            buffer_s, buffer_a, buffer_r = [], [], []
            for t in range(EP_LEN):
                if not ROLLING_EVENT.is_set():  # while global PPO is updating
                    ROLLING_EVENT.wait()  # wait until PPO is updated
                    buffer_s, buffer_a, buffer_r = [], [], [
                    ]  # clear history buffer, use new policy to collect data
                if self.wid == 0:
                    self.viewer.render()
                a = self.ppo.choose_action(s)
                s_, r, done = self.env.step(a)
                buffer_s.append(s)
                buffer_a.append(a)
                # NOTE(review): the +8 / 8 constants presumably target a
                # specific reward range - confirm they suit JacoEnv rewards.
                buffer_r.append(
                    (r + 8) / 8)  # normalize reward, find to be useful
                s = s_
                ep_r += r

                GLOBAL_UPDATE_COUNTER += 1  # count to minimum batch size, no need to wait other workers
                # Flush at episode end, at the batch-size threshold, or on done.
                if t == EP_LEN - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE or done:
                    # The bootstrap value comes from get_v(s_) even when `done`
                    # is true. NOTE(review): confirm that is intended for
                    # terminal states.
                    v_s_ = self.ppo.get_v(s_)
                    discounted_r = []  # compute discounted reward
                    # `r` is reused as the loop variable; ep_r was already
                    # updated above, so the step reward is no longer needed.
                    for r in buffer_r[::-1]:
                        v_s_ = r + GAMMA * v_s_
                        discounted_r.append(v_s_)
                    discounted_r.reverse()

                    bs, ba, br = np.vstack(buffer_s), np.vstack(
                        buffer_a), np.array(discounted_r)[:, np.newaxis]
                    buffer_s, buffer_a, buffer_r = [], [], []
                    QUEUE.put(np.hstack((bs, ba, br)))  # put data in the queue
                    if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                        ROLLING_EVENT.clear()  # stop collecting data
                        UPDATE_EVENT.set()  # globalPPO update

                    if GLOBAL_EP >= EP_MAX:  # stop training
                        COORD.request_stop()
                        break
                if done:
                    break

            # Append the raw episode return to reward.txt, one value per line.
            with open("reward.txt", "a") as f:
                f.write(str(ep_r) + '\n')
            # record reward changes, plot later
            # Running return: 0.9 * previous + 0.1 * this episode.
            if len(GLOBAL_RUNNING_R) == 0:
                GLOBAL_RUNNING_R.append(ep_r)
            else:
                GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1] * 0.9 +
                                        ep_r * 0.1)
            GLOBAL_EP += 1
            # Disabled experiments (rewarding-distance schedule, target resets,
            # state dumps), kept for reference:
            # r_d = 200 / (sum(GLOBAL_RUNNING_R[:-10:-1])/10 + 250 + GLOBAL_EP)
            # print(r_d)
            #self.env.reduce_rewarding_distance(r_d)
            #if sum(GLOBAL_RUNNING_R[:-11:-1])/10 > 100:
            #    self.env.reset_target()
            # if GLOBAL_EP > 1495 and GLOBAL_EP % 300 == 0:
            #     self.env.reset_target()
            # if GLOBAL_EP > 1495 and GLOBAL_EP % 300 == 1:
            #     self.env.reset_target()
            # if GLOBAL_EP > 1495 and GLOBAL_EP % 300 == 2:
            #     self.env.reset_target()
            # if GLOBAL_EP > 1495 and GLOBAL_EP % 300 == 3:
            #     self.env.reset_target()
            # if sum(GLOBAL_RUNNING_R[:-11:-1])/10 > 1500:
            #     with open("state.txt", "a") as f:
            #         f.write(str(self.env.sim.model.body_pos[-1]) + '\n')
            #         f.write(str(self.env.sim.model.geom_pos[-1]) + '\n')
            #print('{0:.1f}%'.format(GLOBAL_EP/EP_MAX*100), '|W%i' % self.wid, '|Ep_r: %.2f' % ep_r,)
            print(
                GLOBAL_EP,
                '/',
                EP_MAX,
                '|W%i' % self.wid,
                '|Ep_r: %.2f' % ep_r, )
# Shared state read and written by the worker threads and the PPO updater.
# `workers` and `GLOBAL_PPO` are defined outside this excerpt.
GLOBAL_UPDATE_COUNTER = 0
GLOBAL_EP = 0
GLOBAL_RUNNING_R = []  # Global_reward
COORD = tf.train.Coordinator()
QUEUE = queue.Queue()  # workers putting data in this queue

# One thread per worker, started as soon as it is created.
threads = []
for worker in workers:
    worker_thread = threading.Thread(target=worker.work, args=())
    worker_thread.start()  # training
    threads.append(worker_thread)

# Plus one thread that runs the PPO update.
ppo_updater = threading.Thread(target=GLOBAL_PPO.update, )
threads.append(ppo_updater)
ppo_updater.start()
COORD.join(threads)

# Plot the running reward once all threads have been joined.
episode_axis = np.arange(len(GLOBAL_RUNNING_R))
plt.plot(episode_axis, GLOBAL_RUNNING_R)
plt.xlabel('Episode')
plt.ylabel('Moving reward')
plt.ion()
plt.show()

# Replay the trained policy forever in a fresh environment, 200 steps per reset.
env = JacoEnv(64, 64, 100)
viewer = mujoco_py.MjViewer(env.sim)
while True:
    s = env.reset()
    for t in range(200):
        viewer.render()
        step_result = env.step(GLOBAL_PPO.choose_action(s))
        s = step_result[0]
def run():
    """Construct the environment and DQN agent, then evaluate on two colourings.

    A pre-trained encoder (``WEIGHTS_FILE``) is frozen and applied to each of
    the ``WINDOW_LENGTH`` frames of the input. The per-frame encodings are
    concatenated and fed through two 512-unit dense layers to a linear
    Q-value head. The training branch is currently switched off
    (``if False``), so the function loads ``stylegan_dqn_weights``, tests for
    100 episodes on the current colours, then for 100 episodes on a fixed
    "target domain" colouring, saving both test histories as ``.npz`` files.
    """
    env = JacoEnv(64, 64, 100, 0.1, 0.8, True)
    nb_actions = env.real_num_actions  # All possible action, where each action is a unit in this vector

    # Random source-domain colours: floor RGB in [0.45, 0.55), cube RGB in [0, 1).
    new_floor_color = list((0.55 - 0.45) * np.random.random(3) + 0.45) + [1.]
    new_cube_color = list(np.random.random(3)) + [1.]
    env.change_floor_color(new_floor_color)
    env.change_cube_color(new_cube_color)

    encoder = load_model(WEIGHTS_FILE)
    print("#########################")
    nb_observation_space = (64, 64, 3)
    original_input = Input(shape=(WINDOW_LENGTH,) + nb_observation_space)
    # Bind `i` as a default argument: a plain closure would look `i` up when
    # the function is next evaluated, and by then the comprehension variable
    # holds its final value. Any later re-invocation or serialisation of the
    # layer function (e.g. if the model is cloned or rebuilt) would therefore
    # make every slice select the last frame.
    in_layer = [
        Lambda(lambda x, i=i: x[:, i, :, :])(original_input)
        for i in range(WINDOW_LENGTH)
    ]

    # The encoder is used as a fixed feature extractor.
    for layer in encoder.layers:
        layer.trainable = False
    print(encoder.summary())

    encoder_output = [encoder(x) for x in in_layer]
    x = Concatenate()(encoder_output)
    x = Dense(512, activation='relu')(x)
    x = Dense(512, activation='relu')(x)
    x = Dense(nb_actions, activation='linear')(x)
    model = Model(original_input, [x])
    print(model.summary())

    if MULTI_GPU:
        model = multi_gpu_model(model, gpus=2)
        print(model.summary())

    num_warmup = 50000
    # num_simulated_annealing = 500000 + num_warmup
    # num_warmup = 0
    num_simulated_annealing = 220000 + num_warmup

    memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1.,
                                  value_min=.1, value_test=.05,
                                  nb_steps=num_simulated_annealing)
    dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy,
                   memory=memory, nb_steps_warmup=num_warmup, gamma=.99,
                   target_model_update=10000, train_interval=4, delta_clip=1.)
    dqn.compile(Adam(lr=.00025), metrics=['mae'])

    # Training is disabled; flip the condition to resume training from the
    # saved weights instead of evaluating them.
    if False:
        dqn.load_weights("stylegan_dqn_weights")
        checkpoint_callback = ModelCheckpoint("stylegan_dqn_checkpoint",
                                              monitor='episode_reward',
                                              verbose=0, save_best_only=True,
                                              save_weights_only=True,
                                              mode='max', period=10)
        history = dqn.fit(env, nb_steps=num_simulated_annealing + 450000,
                          visualize=False, verbose=1,
                          callbacks=[checkpoint_callback])
        dqn.save_weights("stylegan_dqn_weights")
        np.savez_compressed("stylegan_dqn_history",
                            episode_reward=np.asarray(history.history['episode_reward']))
    else:
        dqn.load_weights("stylegan_dqn_weights")

        print("original domain")
        source_test_losses = dqn.test(env, nb_episodes=100, visualize=True)
        np.savez_compressed("myvae_dqn_source_test",
                            episode_reward=np.asarray(source_test_losses.history['episode_reward']),
                            nb_steps=np.asarray(source_test_losses.history['nb_steps']))

        print("target domain")
        new_floor_color = [0.4, 0.6, 0.4, 1.]
        new_cube_color = [1.0, 0.0, 0.0, 1.]
        env.change_floor_color(new_floor_color)
        env.change_cube_color(new_cube_color)
        target_test_losses = dqn.test(env, nb_episodes=100, visualize=True)
        np.savez_compressed("myvae_dqn_target_test",
                            episode_reward=np.asarray(target_test_losses.history['episode_reward']),
                            nb_steps=np.asarray(target_test_losses.history['nb_steps']))
def test(rank, args, T, shared_model):
    """Periodically evaluate ``shared_model`` greedily and checkpoint it.

    Runs until the shared counter ``T`` exceeds ``args.T_max``. Each
    evaluation plays ``args.evaluation_episodes`` episodes, prints the average
    reward and episode length, plots them with ``plot_line`` and saves the
    model's state dict under ``results/``. With ``args.evaluate`` set, the
    function returns after the first evaluation.

    Args:
        rank: Offset added to ``args.seed`` for this process's seeds.
        args: Parsed options (sizes, limits, intervals, render flag).
        T: Shared step counter exposing ``value()``.
        shared_model: Model whose weights are copied at every episode start.
    """
    torch.manual_seed(args.seed + rank)

    env = JacoEnv(args.width, args.height, args.frame_skip,
                  args.rewarding_distance, args.control_magnitude,
                  args.reward_continuous)
    env.seed(args.seed + rank)
    if args.render:
        # The image artist is created from the third element of the first
        # observation; it is later updated with the second element (see below).
        (_, _, obs_rgb_view2) = env.reset()
        plt.ion()
        f, ax = plt.subplots()
        im = ax.imshow(obs_rgb_view2)

    model = ActorCritic(None, args.non_rgb_state_size, None, args.hidden_size)
    model.eval()

    can_test = True  # Test flag
    t_start = 1  # Test step counter to check against global counter
    rewards, steps = [], []  # Rewards and steps for plotting
    n_digits = str(
        len(str(args.T_max)))  # Max num. of digits for logging steps
    done = True  # Start new episode

    while T.value() <= args.T_max:
        if can_test:
            t_start = T.value()  # Reset counter

            # Evaluate over several episodes and average results
            avg_rewards, avg_episode_lengths = [], []
            for _ in range(args.evaluation_episodes):
                while True:
                    # Reset or pass on hidden state
                    if done:
                        # Sync with shared model every episode
                        model.load_state_dict(shared_model.state_dict())
                        # NOTE(review): Variable(..., volatile=True) looks like
                        # the legacy PyTorch inference idiom - confirm against
                        # the torch version this project pins.
                        hx = Variable(
                            torch.zeros(1, args.hidden_size), volatile=True)
                        cx = Variable(
                            torch.zeros(1, args.hidden_size), volatile=True)
                        # Reset environment and done flag
                        state = state_to_tensor(env.reset())
                        action, reward, done, episode_length = (0, 0, 0, 0, 0,
                                                                0), 0, False, 0
                        reward_sum = 0

                    # Calculate policy
                    policy, _, (hx, cx) = model(
                        Variable(
                            state[0], volatile=True),
                        Variable(
                            state[1], volatile=True),
                        (hx.detach(),
                         cx.detach()))  # Break graph for memory efficiency

                    # Choose action greedily
                    # One arg-max per element of `policy`.
                    action = [p.max(1)[1].data[0, 0] for p in policy]

                    # Step
                    state, reward, done = env.step(action)
                    # Keep the raw views for rendering before tensor conversion.
                    obs_rgb_view1 = state[1]
                    obs_rgb_view2 = state[2]
                    state = state_to_tensor(state)
                    reward_sum += reward
                    done = done or episode_length >= args.max_episode_length  # Stop episodes at a max length
                    episode_length += 1  # Increase episode counter

                    # Optionally render validation states
                    if args.render:
                        # rendering the first camera view
                        im.set_data(obs_rgb_view1)
                        plt.draw()
                        plt.pause(0.05)
                        # rendering mujoco simulation
                        # viewer = mujoco_py.MjViewer(env.sim)
                        # viewer.render()

                    # Log and reset statistics at the end of every episode
                    if done:
                        avg_rewards.append(reward_sum)
                        avg_episode_lengths.append(episode_length)
                        break

            print(('[{}] Step: {:<' + n_digits +
                   '} Avg. Reward: {:<8} Avg. Episode Length: {:<8}').format(
                       datetime.utcnow().strftime(
                           '%Y-%m-%d %H:%M:%S,%f')[:-3], t_start,
                       sum(avg_rewards) / args.evaluation_episodes,
                       sum(avg_episode_lengths) / args.evaluation_episodes))
            rewards.append(avg_rewards)  # Keep all evaluations
            steps.append(t_start)
            plot_line(steps, rewards)  # Plot rewards
            torch.save(model.state_dict(),
                       os.path.join('results', str(t_start) +
                                    '_model.pth'))  # Checkpoint model params
            can_test = False  # Finish testing
            if args.evaluate:
                return
        else:
            # Re-arm once the global counter has advanced far enough.
            if T.value() - t_start >= args.evaluation_interval:
                can_test = True

        time.sleep(0.001)  # Check if available to test every millisecond
def __init__(self, width, height, frame_skip, rewarding_distance, control_magnitude, reward_continuous, render):
    """Build the ``JacoEnv`` from the given settings and store ``render``.

    All arguments except ``render`` are passed positionally, in order, to
    ``JacoEnv``.
    """
    env_settings = (width, height, frame_skip, rewarding_distance,
                    control_magnitude, reward_continuous)
    self.env = JacoEnv(*env_settings)
    self.render = render
from jaco_arm import JacoStackEnv as JacoEnv
import mujoco_py
import gym
import numpy as np
import glfw
from sklearn.mixture import GaussianMixture as GM
import cv2
from sklearn.decomposition import PCA

# Script: fit a 3-component Gaussian mixture on the observations of the first
# 30 stored trajectories and print the component assigned to every step.

env = JacoEnv()

traj_data = np.load('new_stack.npz', allow_pickle=True)
obs = traj_data['obs'][:30]
acs = traj_data['acs'][:30]
ret_save_list = []

# PCA pre-projection is disabled:
# pca = PCA(n_components=3)
# nobs = pca.fit_transform(np.vstack(obs))
stacked_obs = np.vstack(obs)
print(stacked_obs.shape)
print(obs[0].shape)

gm = GM(n_components=3, init_params='random', random_state=0)
gm.fit(stacked_obs)

for i, trajectory in enumerate(obs):
    print('traj [', i, '] :', gm.predict(trajectory))

# Re-saving a trimmed dataset is disabled:
# np.savez('new_stack.npz', obs = obs[:100], acs=acs[:100], rets=ret_save_list)
def train(rank, args, T, shared_model, optimiser):
    """Worker loop: collect short rollouts and hand them to ``_train``.

    Until the shared counter ``T`` exceeds ``args.T_max``, the local model is
    synced from ``shared_model``, up to ``args.t_max`` environment steps are
    taken, and the collected policies, values, actions and rewards are passed
    to ``_train`` together with the bootstrap value ``R``.

    Args:
        rank: Offset added to ``args.seed`` for this process's seeds.
        args: Parsed options (sizes, limits, ``t_max``, ``T_max``).
        T: Shared step counter exposing ``value()`` and ``increment()``.
        shared_model: Model whose weights are copied before each rollout.
        optimiser: Forwarded unchanged to ``_train``.
    """
    torch.manual_seed(args.seed + rank)

    env = JacoEnv(args.width, args.height, args.frame_skip,
                  args.rewarding_distance, args.control_magnitude,
                  args.reward_continuous)
    env.seed(args.seed + rank)

    # TODO: pass in the observation and action space
    model = ActorCritic(None, args.non_rgb_state_size, None, args.hidden_size)
    model.train()

    t = 1  # Thread step counter
    done = True  # Start new episode

    while T.value() <= args.T_max:
        # Sync with shared model at least every t_max steps
        model.load_state_dict(shared_model.state_dict())
        # Get starting timestep
        t_start = t

        # Reset or pass on hidden state
        if done:
            hx = Variable(torch.zeros(1, args.hidden_size))
            cx = Variable(torch.zeros(1, args.hidden_size))
            # Reset environment and done flag
            state = state_to_tensor(env.reset())
            action, reward, done, episode_length = (0, 0, 0, 0, 0,
                                                    0), 0, False, 0
        else:
            # Perform truncated backpropagation-through-time (allows freeing buffers after backwards call)
            hx = hx.detach()
            cx = cx.detach()

        # Lists of outputs for training
        policies, Vs, actions, rewards = [], [], [], []

        while not done and t - t_start < args.t_max:
            # Calculate policy and value
            policy, V, (hx, cx) = model(
                Variable(state[0]), Variable(state[1]), (hx, cx))

            # Sample action
            # One sample per element of `policy`.
            action = [
                p.multinomial().data[0, 0] for p in policy
            ]  # Graph broken as loss for stochastic action calculated manually

            # Step
            state, reward, done = env.step(action)
            state = state_to_tensor(state)
            done = done or episode_length >= args.max_episode_length  # Stop episodes at a max length
            episode_length += 1  # Increase episode counter

            # Save outputs for online training
            # The comprehension is used only for its append side effects; the
            # list it builds is discarded.
            [
                arr.append(el)
                for arr, el in zip((policies, Vs, actions, rewards), (
                    policy, V, Variable(torch.LongTensor(action)), reward))
            ]

            # Increment counters
            t += 1
            T.increment()

        # Break graph for last values calculated (used for targets, not directly as model outputs)
        if done:
            # R = 0 for terminal s
            R = Variable(torch.zeros(1, 1))
        else:
            # R = V(s_i; θ) for non-terminal s
            _, R, _ = model(Variable(state[0]), Variable(state[1]), (hx, cx))
            R = R.detach()
        # Vs ends up one element longer than the other lists: the bootstrap value.
        Vs.append(R)

        # Train the network
        _train(args, T, model, shared_model, optimiser, policies, Vs, actions,
               rewards, R)
# Script setup: seeds, shared counter, results directory, shared model and
# its optimiser. `args` is defined before this excerpt.
args.non_rgb_state_size = 18  # 9 joints qpos and qvel TODO: don't hardcode!
mp.set_start_method('spawn')
torch.manual_seed(args.seed)
T = Counter()  # Global shared counter

# Results dir: create it if missing, otherwise require the overwrite flag.
if os.path.exists('results'):
    if not args.overwrite:
        raise OSError('results dir exists and overwrite flag not passed')
else:
    os.makedirs('results')

# Create shared network
env = JacoEnv(args.width, args.height, args.frame_skip,
              args.rewarding_distance, args.control_magnitude,
              args.reward_continuous)
shared_model = ActorCritic(None, args.non_rgb_state_size, None,
                           args.hidden_size)
shared_model.share_memory()

# Load pretrained weights only when a path was given and it names a file.
pretrained_path = args.model
if pretrained_path and os.path.isfile(pretrained_path):
    shared_model.load_state_dict(torch.load(pretrained_path))

# Create optimiser for shared network parameters with shared statistics
optimiser = SharedRMSprop(shared_model.parameters(),
                          lr=args.lr,
                          alpha=args.rmsprop_decay)
optimiser.share_memory()

# Start validation agent
# BLAS setup os.environ['OMP_NUM_THREADS'] = '1' os.environ['MKL_NUM_THREADS'] = '1' # Setup args = parser.parse_args() args.env = 'jaco' args.non_rgb_state_size = 18 # 9 joints qpos and qvel TODO: don't hardcode! mp.set_start_method('spawn') torch.manual_seed(args.seed) T = Counter() # Global shared counter # Create shared network env = JacoEnv(args.width, args.height, args.frame_skip, args.rewarding_distance, args.control_magnitude, args.reward_continuous) M = cv2.getRotationMatrix2D((32, 32), 180, 1.) done = False for i in trange(1000): done = False j = 0 while not done: obs, reward, done = env.step( np.random.randint(0, 4, env.num_actuators)) img = cv2.warpAffine(obs[2], M, (64, 64)) cv2.imwrite( "training_observations/obs" + str(i) + "_" + str(j) + ".png", img)