def wait_robot(self):
    if self.state == 'idle':
        if self._step < self.skip_step:
            return
    if self.state == 'go_obj':
        if goal_distance(self.robot_state[:2], self.obj_pos[:2]) > self._DIS_ERROR * 2:
            return
    elif self.state == 'down':
        if (goal_distance(self.robot_state[:2], self.obj_pos[:2]) > self._DIS_ERROR
                or self.robot_state[2] > self.obj_pos[2] + self._DIS_ERROR / 2.0):
            return
    elif self.state == 'up':
        if (goal_distance(self.robot_state[:2], self.tar_pos[:2]) > self._DIS_ERROR * 2
                or self.robot_state[2] < self.tar_pos[2] - self._DIS_ERROR / 2.0):
            return
    # Done!!!
    elif self.state == 'go_goal':
        if goal_distance(self.robot_state[:3], self.goal_pos) > self._DIS_ERROR * 3:
            return
        self._done = True
    elif self.state == 'grip':
        # print(self._step, self.past_gs, self.robot_state[-1])
        if (self._step < self.skip_step
                or self.robot_state[-1] >= 0.05
                or self.past_gs - self.robot_state[-1] > self._DIS_ERROR / 2.0):
            return
    self.state = self.next_state
    self._every_task.append(self._step)
    self._step = 0
def wait_robot(self):
    if self.state == 'go_obj':
        if goal_distance(self.robot_state[:2], self.obj_pos[:2]) > self._DIS_ERROR:
            # print(goal_distance(self.robot_state[:2], self.obj_pos[:2]))
            return
    elif self.state == 'down':
        if goal_distance(self.robot_state[:3], self.obj_pos) > self._DIS_ERROR:
            return
    # Done!!!
    elif self.state == 'up':
        if goal_distance(self.robot_state[:3], self.tar_pos) > self._DIS_ERROR:
            return
        # TODO: Revise this approach to change goal pos
        self._done = True
    elif self.state == 'go_goal':
        if goal_distance(self.robot_state[:3], self.goal_pos) > self._DIS_ERROR:
            return
    elif self.state == 'grip':
        if self.robot_state[-1] >= -.5:
            return
    self.state = self.next_state
def compute_reward(self, achieved_goal, goal, info):
    # Compute distance between goal and the achieved goal.
    d = goal_distance(achieved_goal, goal)
    if self.reward_type == 'sparse':
        return -(d > self.distance_threshold).astype(np.float32)
    else:
        return np.exp(-d)
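# Every snippet in this section calls a goal_distance helper that is not shown here.
# The sketch below is a minimal placeholder for reference only, assuming it matches the
# Euclidean-distance helper used by gym's fetch environments; the project's actual
# definition may differ.
import numpy as np

def goal_distance(goal_a, goal_b):
    """Euclidean distance between two (possibly batched) goal positions."""
    assert goal_a.shape == goal_b.shape
    return np.linalg.norm(goal_a - goal_b, axis=-1)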
def eval(self, model_name='', random=False):
    if not random:
        self.load_weights('pretrained/' + model_name)
    score = 0
    solve_count = 0
    tr = tqdm(range(100))
    for ep in tr:
        state = self.env.reset()
        tr.set_description("Solve percentage: {:.3f}".format(solve_count / (ep + 1)))
        for t in range(200):
            if random:
                a = self.env.action_space.sample()
            else:
                a, v = self.call(state['observation'])
            state, r, done, info = self.env.step(a)
            d = fetch_env.goal_distance(state['achieved_goal'], state['desired_goal'])
            done = d <= self.dist_thresh
            if done:
                solve_count += 1
                break
            score += r
    return score / 100.0
def compute_reward(self, achieved_goal, goal, info):
    reward = 0
    completion_reward = 5  # TODO: Make tweakable hyperparam
    # Compute distance between goal and the achieved goal.
    d = goal_distance(achieved_goal, goal)
    reached_goal = d <= self.distance_threshold
    # add distance reward
    if self.reward_type == 'sparse':
        reward = -(d > self.distance_threshold).astype(np.float32)
    else:
        # dense distance reward
        reward = -d
    # add completion reward
    if reached_goal:
        reward += completion_reward
    return reward

# def reset(self):
#     ...
# def render(self, mode='human'):
#     ...
# def close(self):
#     ...
def eval(self, env, model_name='', random=False, render=False):
    if not random:
        self.actor.model.load_weights('pretrained/' + model_name + 'Actor.h5')
        self.critic.model.load_weights('pretrained/' + model_name + 'Critic.h5')
    score = 0
    solve_count = 0
    tr = tqdm(range(100))
    avg_time = 0
    for ep in tr:
        state = env.reset()
        for t in range(50):
            if render:
                env.render()
            if random:
                a = env.action_space.sample()
            else:
                a = self.policy_action(self.format_state(state))[0]
            state, r, done, info = env.step(a)
            d = goal_distance(state['achieved_goal'], state['desired_goal'])
            done = d <= 0.05
            if done:
                solve_count += 1
                break
            score += r
        tr.set_description("Solve percentage: {:.3f}".format(solve_count / (ep + 1)))
        avg_time += t
    print("average time to solve:", avg_time / 100.0)
    return score / 100.0
def step(self, action):
    action = np.clip(action, self.action_space.low, self.action_space.high)
    self._set_action(action)
    self.sim.step()
    self._step_callback()
    obs = self._get_obs()
    done = self._is_success(obs['achieved_goal'], self.goal)
    info = {
        'is_success': done,  # does not include done from TimeLimit (episode completion)
        'dist': goal_distance(obs['achieved_goal'], self.goal)
    }
    reward = self.compute_reward(obs['achieved_goal'], self.goal, info)
    # Time penalty to encourage faster reaching
    reward_time = -0.1  # TODO: Make tweakable hyperparam
    reward = reward + reward_time
    return obs, reward, done, info
def compute_reward(self, achieved_goal, goal, info): """Compute goal reward""" d = goal_distance(achieved_goal, goal) return (d <= self.distance_threshold).astype(np.float32)
        total_reward += r
        if step % 20 == 0:
            rgb_obs = env.sim.render(width=200, height=200, camera_name="external_camera_0",
                                     depth=False, mode='offscreen', device_id=-1)
            # rgb_obs1 = env.sim.render(width=200, height=200, camera_name="external_camera_1",
            #                           depth=False, mode='offscreen', device_id=-1)
            plt.figure(1)
            plt.imshow(rgb_obs)
            # plt.figure(2)
            # plt.imshow(rgb_obs1)
            plt.show(block=False)
            plt.pause(0.001)
        if (not upper
                and goal_distance(obs['eeinfo'][0][:2], obs['achieved_goal'][:2]) < 0.05
                and obs['eeinfo'][0][-1] > obs['achieved_goal'][-1] + .01):
            upper = 1
            break
        if info['is_success'] or done:
            break
    # plt.figure(1)
    # plt.imshow(gif_pic/255.)
    # plt.figure(2)
    # plt.imshow(rgb_obs)
    # plt.show(block=False)
    # plt.pause(0.001)
    upper_sucess += upper
    print(i, "total reward %0.2f. success %d rate %.2f" % (total_reward, upper_sucess, upper_sucess / (i + 1)))
def _is_success(self, achieved_goal, desired_goal):
    d = goal_distance(achieved_goal, desired_goal)
    return (d < self.distance_threshold).astype(np.float32)
def compute_reward(self, achieved_goal, goal, info):
    # Compute distance between goal and the achieved goal.
    return -goal_distance(achieved_goal, goal)
def train(self, num_eps=100, render=False, model_start='', model_save='fetchReach.h5',
          custom_r=False, v_lr=1.0, p_lr=1.0, verbose=1):
    best_avg_model_name = 'best_avg-' + model_save
    self.p_opt = tf.train.RMSPropOptimizer(learning_rate=p_lr, epsilon=0.1)
    # self.p_opt = tf.train.AdamOptimizer(learning_rate=p_lr)
    # self.p_opt = tf.train.GradientDescentOptimizer(learning_rate=p_lr)
    # self.v_opt = tf.train.AdamOptimizer(learning_rate=v_lr)
    # self.load_weights('mtnCar.h5')
    if model_start:
        self.load_weights(model_start)
    self.num_eps = num_eps
    best_r = -float('inf')
    best_avg = -float('inf')
    num_steps = 200
    avg_r_ep = 0
    prev_actions = np.zeros(4, dtype=np.float64)
    if verbose == 0:
        ep_iter = tqdm(range(self.num_eps))
    else:
        ep_iter = range(self.num_eps)
    solve_count = 0
    for ep in ep_iter:
        tr = range(num_steps)
        if verbose == 1:
            tr = tqdm(tr)
        # tr = tqdm(itertools.count())
        state = self.env.reset()
        # self.env.distance_threshold = 5
        total_r = 0
        avg_value = 0
        actions = np.zeros(4, dtype=np.float64)
        avg = 0
        for t in tr:
            with tf.GradientTape(persistent=True) as tape:
                a, v = self.call(state['observation'])
                # a = list(a.numpy()) + [0]
                actions += a
                if render:
                    self.env.render()
                next_state, r, done, info = self.env.step(a)
                d = fetch_env.goal_distance(next_state['achieved_goal'],
                                            next_state['desired_goal'])
                done = d <= self.dist_thresh and t > 1
                if custom_r:
                    r = (self.dist_thresh / (d + 1e-6))
                    r = min(r, 1.0)
                if done:
                    solve_count += 1
                    print('solved')
                    if custom_r:
                        r += 5
                total_r += r
                avg_value += v.numpy()
                td_target = r + 0.99 * self.call(next_state['observation'])[1]
                td_error = td_target - v
                vloss = self.get_value_loss(td_target)
                ploss = tf.reduce_mean(self.get_policy_loss(td_error))
                self.loss = vloss + ploss
                # if custom_r:
                #     self.loss *= -1
            self.update(tape)
            # grads = self.get_grads(tape, td_error, td_target)
            # self.optimizer.apply_gradients(zip(grads, self.weights))
            # "Frame Reward {:.3f} | "
            if verbose == 1:
                tr.set_description("Ep {}/{} | "
                                   "Loss {:.3f} | "
                                   "Total Reward {:.3f} | "
                                   "Avg Value {:.3f} | "
                                   "Solve Ratio {:.3f} | "
                                   "Avg Reward/Epoch {:.3f}".format(
                                       ep + 1, self.num_eps, self.loss, total_r,
                                       avg_value / (t + 1), solve_count / (ep + 1),
                                       avg_r_ep))
            # "Avg Reward {:.3f} | "
            if done:
                break
            state = next_state
        if verbose == 0:
            ep_iter.set_description("Solve percentage: {:.3f}".format(solve_count / (ep + 1)))
        if done and total_r < avg:
            total_r = avg + 5
        if avg_r_ep == 0:
            avg_r_ep = total_r
        else:
            avg_r_ep = avg_r_ep * 0.99 + total_r * 0.01
        if fetch_env.goal_distance(actions, prev_actions) < 0.1:
            print('Possible error: actions same as previous episode {}\n'.format(actions / num_steps))
        prev_actions = actions
        avg = avg_r_ep  # avg_r_ep / (ep + 1)
        if avg >= best_avg and ep > 10:
            print(f'\nSaving best average model with reward of {avg} to {best_avg_model_name}')
            best_avg = avg
            self.save_weights('pretrained/' + best_avg_model_name)
        if total_r >= best_r:
            print(f'\nSaving best model with reward of {total_r} to {model_save}')
            best_r = total_r
            self.best_weights = self.weights
            self.save_weights('pretrained/' + model_save)
    self.save_weights('pretrained/last_' + model_save)
def train(self, env, args):
    results = []
    num_steps = 200
    # First, gather experience
    tqdm_e = tqdm(range(args.nb_episodes), desc='Score', leave=True, unit="episode")
    avg_r_ep = 0
    best_avg = -float('inf')
    best_score = -float('inf')
    past_samples = 15
    hist_ratio = deque(maxlen=past_samples)
    hist_scores = deque(maxlen=past_samples)
    for e in tqdm_e:
        noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(self.act_dim))
        # Reset episode
        time, cumul_reward, done = 0, 0, False
        s = env.reset()
        # noise = OrnsteinUhlenbeckProcess(size=self.act_dim)
        for _ in range(num_steps):
            if args.render:
                env.render()
            # Actor picks an action (following the deterministic policy)
            old_state = self.format_state(s)
            # print(old_state.shape)
            a = self.policy_action(old_state)
            # Clip continuous values to be valid w.r.t. environment
            a = np.clip(a + noise(), -self.act_range, self.act_range)
            # Retrieve new state, reward, and whether the state is terminal
            a = np.squeeze(a)
            new_state, r, done, info = env.step(a)
            dist = goal_distance(new_state['achieved_goal'], new_state['desired_goal'])
            # new_state = new_state['observation']
            # Add outputs to memory buffer
            self.store_states(s, a, r, done, info, new_state)
            s = new_state
            cumul_reward += r
            # Sample experience from buffer
            states, actions, rewards, dones, new_states = self.sample_batch(args.batch_size)
            # Predict target q-values using target networks
            q_values = self.critic.target_predict(
                [new_states, self.actor.target_predict(new_states)])
            # Compute critic target
            critic_target = self.bellman(rewards, q_values, dones)
            # Train both networks on sampled batch, update target networks
            self.update_models(states, actions, critic_target)
            # Update current state
            if done:
                break
        if avg_r_ep == 0:
            avg_r_ep = cumul_reward
        else:
            avg_r_ep = avg_r_ep * 0.99 + cumul_reward * 0.01
        if avg_r_ep >= best_avg:
            best_avg = avg_r_ep
            self.actor.model.save_weights('pretrained/best_avg_ddpgActor.h5')
            self.critic.model.save_weights('pretrained/best_avg_ddpgCritic.h5')
        # Display score
        if cumul_reward >= best_score:
            best_score = cumul_reward
            self.actor.model.save_weights('pretrained/ddpgActor.h5')
            self.critic.model.save_weights('pretrained/ddpgCritic.h5')
        hist_ratio.append(int(dist <= 0.05))
        hist_scores.append(cumul_reward)
        tqdm_e.set_description(
            "Score: {} | "
            "Best Reward: {} (avg: {:.2f})| "
            "Avg Reward, solve ratio over last {} samples: {:.3f}, {:.3f}".format(
                cumul_reward, np.amax(hist_scores), avg_r_ep,
                past_samples, np.mean(hist_scores), np.mean(hist_ratio)))
        tqdm_e.refresh()
    return results
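# The DDPG training loop above constructs an OrnsteinUhlenbeckActionNoise object and calls
# it once per step to add temporally correlated exploration noise. Its definition is not
# part of this section; the sketch below is one standard implementation, and the
# theta/sigma/dt defaults are assumptions rather than the project's actual values.
import numpy as np

class OrnsteinUhlenbeckActionNoise:
    """Ornstein-Uhlenbeck process for continuous-action exploration noise."""

    def __init__(self, mu, sigma=0.2, theta=0.15, dt=1e-2, x0=None):
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        # One Euler-Maruyama step of the OU process: drift back toward mu plus Gaussian noise.
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape))
        self.x_prev = x
        return x

    def reset(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)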