def init_demo_buffer(self, demoDataFile, update_stats=True):  # function that initializes the demo buffer
    demoData = np.load(demoDataFile)  # load the demonstration data from the data file
    info_keys = [key.replace('info_', '') for key in self.input_dims.keys() if key.startswith('info_')]
    info_values = [np.empty((self.T - 1, 1, self.input_dims['info_' + key]), np.float32) for key in info_keys]
    demo_data_obs = demoData['obs']
    demo_data_acs = demoData['acs']
    demo_data_info = demoData['info']

    for epsd in range(self.num_demo):  # we initialize the whole demo buffer at the start of the training
        obs, acts, goals, achieved_goals = [], [], [], []
        i = 0
        for transition in range(self.T - 1):
            obs.append([demo_data_obs[epsd][transition].get('observation')])
            acts.append([demo_data_acs[epsd][transition]])
            goals.append([demo_data_obs[epsd][transition].get('desired_goal')])
            achieved_goals.append([demo_data_obs[epsd][transition].get('achieved_goal')])
            for idx, key in enumerate(info_keys):
                info_values[idx][transition, i] = demo_data_info[epsd][transition][key]

        obs.append([demo_data_obs[epsd][self.T - 1].get('observation')])
        achieved_goals.append([demo_data_obs[epsd][self.T - 1].get('achieved_goal')])

        episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals)
        for key, value in zip(info_keys, info_values):
            episode['info_{}'.format(key)] = value

        episode = convert_episode_to_batch_major(episode)
        global DEMO_BUFFER
        DEMO_BUFFER.store_episode(episode)  # create the observation dict and append it to the demonstration buffer
        logger.debug("Demo buffer size currently ", DEMO_BUFFER.get_current_size())  # print out the demonstration buffer size

        if update_stats:
            # add transitions to the normalizer to normalize the demo data as well
            episode['o_2'] = episode['o'][:, 1:, :]
            episode['ag_2'] = episode['ag'][:, 1:, :]
            num_normalizing_transitions = transitions_in_episode_batch(episode)
            transitions = self.sample_transitions(episode, num_normalizing_transitions)

            o, g, ag = transitions['o'], transitions['g'], transitions['ag']
            transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g)
            # No need to preprocess o_2 and g_2 since this is only used for stats

            self.o_stats.update(transitions['o'])
            self.g_stats.update(transitions['g'])

            self.o_stats.recompute_stats()
            self.g_stats.recompute_stats()
        episode.clear()

    logger.info("Demo buffer size: ", DEMO_BUFFER.get_current_size())  # print out the demonstration buffer size
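# Note on the expected demo file: init_demo_buffer() above reads demoData['obs'], demoData['acs'] and
# demoData['info'] indexed as [episode][timestep], where each obs entry is a dict with 'observation',
# 'desired_goal' and 'achieved_goal'. Below is a minimal sketch of a compatible .npz layout; the
# dimensions, file name and the 'is_success' info key are placeholder assumptions, not taken from the
# original repository.
import numpy as np

num_demo, T = 2, 50                      # assumed values; must match self.num_demo and self.T
obs_dim, act_dim, goal_dim = 10, 4, 3    # assumed dimensions

demo_obs = [[{'observation': np.zeros(obs_dim),
              'desired_goal': np.zeros(goal_dim),
              'achieved_goal': np.zeros(goal_dim)} for _ in range(T)]
            for _ in range(num_demo)]
demo_acs = [[np.zeros(act_dim) for _ in range(T - 1)] for _ in range(num_demo)]
demo_info = [[{'is_success': 0.0} for _ in range(T - 1)] for _ in range(num_demo)]

# np.load() needs allow_pickle=True for the object arrays on recent NumPy versions.
np.savez_compressed('demo_data.npz',
                    obs=np.array(demo_obs, dtype=object),
                    acs=np.array(demo_acs),
                    info=np.array(demo_info, dtype=object))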
def generate_rollouts(self):
    """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current
    policy acting on it accordingly.
    """
    directory_plot = ('../shadow-hand-obervation-plot/shadow-hand-obs-png/'
                      + datetime.datetime.now().strftime("%m%d_%H%M%S") + os.sep)
    directory_env = ('../shadow-hand-observation-env/shadow-hand-env-png/'
                     + datetime.datetime.now().strftime("%m%d_%H%M%S") + os.sep)
    self.reset_all_rollouts()

    # compute observations
    o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32)  # observations
    ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32)  # achieved goals
    o[:] = self.initial_o
    ag[:] = self.initial_ag

    # generate episodes
    obs, achieved_goals, acts, goals, successes = [], [], [], [], []
    info_values = [np.empty((self.T, self.rollout_batch_size, self.dims['info_' + key]), np.float32)
                   for key in self.info_keys]
    Qs = []

    x_bar = list(range(1, 62))  # 61 bar positions, one per observation component
    x_lab = [
        "WR-J1-qpos", "WR-J0-qpos", "FF-J3-qpos", "FF-J2-qpos", "FF-J1-qpos", "FF-J0-qpos",
        "MF-J3-qpos", "MF-J2-qpos", "MF-J1-qpos", "MF-J0-qpos", "RF-J3-qpos", "RF-J2-qpos",
        "RF-J1-qpos", "RF-J0-qpos", "LF-J4-qpos", "LF-J3-qpos", "LF-J2-qpos", "LF-J1-qpos",
        "LF-J0-qpos", "TH-J4-qpos", "TH-J3-qpos", "TH-J2-qpos", "TH-J1-qpos", "TH-J0-qpos",
        "WR-J1-qvel", "WR-J0-qvel", "FF-J3-qvel", "FF-J2-qvel", "FF-J1-qvel", "FF-J0-qvel",
        "MF-J3-qvel", "MF-J2-qvel", "MF-J1-qvel", "MF-J0-qvel", "RF-J3-qvel", "RF-J2-qvel",
        "RF-J1-qvel", "RF-J0-qvel", "LF-J4-qvel", "LF-J3-qvel", "LF-J2-qvel", "LF-J1-qvel",
        "LF-J0-qvel", "TH-J4-qvel", "TH-J3-qvel", "TH-J2-qvel", "TH-J1-qvel", "TH-J0-qvel",
        "object_qvel-0", "object_qvel-1", "object_qvel-2", "object_qvel-3", "object_qvel-4",
        "object_qvel-5", "achieved_goal-0", "achieved_goal-1", "achieved_goal-2",
        "achieved_goal-3", "achieved_goal-4", "achieved_goal-5", "achieved_goal-6"]

    # Lists used for appending values from the episode
    observation_catcher = []
    observation_catcher_1 = []
    observation_catcher_2 = []
    observation_catcher_3 = []
    observation_catcher_4 = []  # the largest index used for the observation parameters qpos and qvel
    observation_catcher_5 = []  # the largest index used for the observation parameter object qvel
    observation_catcher_6 = []  # the largest index used for the observation parameter achieved goal
    # Lists used for appending values from the csv file
    observation_catcher_f0 = []
    observation_catcher_f1 = []
    observation_catcher_f2 = []
    observation_catcher_f3 = []
    observation_catcher_f4 = []  # the largest index used for the observation parameters qpos and qvel from the csv file
    observation_catcher_f5 = []  # the largest index used for the observation parameter object qvel from the csv file
    observation_catcher_f6 = []  # the largest index used for the observation parameter achieved goal from the csv file

    for t in range(self.T):
        policy_output = self.policy.get_actions(
            o, ag, self.g,
            compute_Q=self.compute_Q,
            noise_eps=self.noise_eps if not self.exploit else 0.,
            random_eps=self.random_eps if not self.exploit else 0.,
            use_target_net=self.use_target_net)

        if self.compute_Q:
            u, Q = policy_output
            Qs.append(Q)
        else:
            u = policy_output

        if u.ndim == 1:
            # The non-batched case should still have a reasonable shape.
            u = u.reshape(1, -1)

        o_new = np.empty((self.rollout_batch_size, self.dims['o']))
        ag_new = np.empty((self.rollout_batch_size, self.dims['g']))
        success = np.zeros(self.rollout_batch_size)
        # u_check = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        # used to check observation value changes --> pass u_check to step() instead of u[i]

        # compute new states and observations
        for i in range(self.rollout_batch_size):
            try:
                # We fully ignore the reward here because it will have to be re-computed for HER.
                curr_o_new, _, _, info = self.envs[i].step(u[i])  # u[i] & u_check
                if 'is_success' in info:
                    success[i] = info['is_success']
                o_new[i] = curr_o_new['observation']
                ag_new[i] = curr_o_new['achieved_goal']
                for idx, key in enumerate(self.info_keys):
                    info_values[idx][t, i] = info[key]

                if self.render:
                    self.envs[i].render()
                elif self.rendder_and_save_png:  # ndrw
                    rgb_array = self.envs[i].render(mode='rgb_array')
                    im = Image.fromarray(rgb_array)
                    # the crop setting needs to be adjusted per camera direction;
                    # the required parameters are in the resource file
                    lov = im.crop((230, 180, 780, 730))

                    # append the required values from the observation vector (indices 0-60)
                    observation_catcher.append(o_new[i][0])
                    observation_catcher_1.append(o_new[i][1])
                    observation_catcher_2.append(o_new[i][2])
                    observation_catcher_3.append(o_new[i][3])
                    observation_catcher_4.append(o_new[i][4])
                    observation_catcher_5.append(o_new[i][5])
                    observation_catcher_6.append(o_new[i][6])

                    # read the csv file which holds the recorded observation-space values of the shadow hand
                    with open('two_finger_ac_values/foo_0.csv', 'r') as readcsv:
                        plots = csv.reader(readcsv, delimiter=',')
                        for row in plots:
                            # append the required values from the observation vector (0-60) of the csv file
                            observation_catcher_f0.append(float(row[0]))
                            observation_catcher_f1.append(float(row[1]))
                            observation_catcher_f2.append(float(row[2]))
                            observation_catcher_f3.append(float(row[3]))
                            observation_catcher_f4.append(float(row[4]))
                            observation_catcher_f5.append(float(row[5]))
                            observation_catcher_f6.append(float(row[6]))

                    ax1.clear()
                    # marker line for each step
                    ax1.axvline(len(observation_catcher) - 1, ymin=-1, ymax=1, color='k', linestyle=':', linewidth=3)
                    # full curve for all steps
                    ax1.plot(observation_catcher_f0, color='xkcd:coral', linewidth=4, label="achieved_goal-0")
                    ax1.plot(observation_catcher_f1, color='xkcd:green', linewidth=4, label="achieved_goal-1")
                    ax1.plot(observation_catcher_f2, color='xkcd:goldenrod', linewidth=4, label="achieved_goal-2")
                    ax1.plot(observation_catcher_f3, color='xkcd:orchid', linewidth=4, label="achieved_goal-3")
                    ax1.plot(observation_catcher_f4, color='xkcd:azure', linewidth=4, label="achieved_goal-4")
                    ax1.plot(observation_catcher_f5, color='xkcd:orangered', linewidth=4, label="achieved_goal-5")
                    ax1.plot(observation_catcher_f6, color='xkcd:tan', linewidth=4, label="achieved_goal-6")
                    # ball marker for the current step
                    ax1.plot(observation_catcher, 'o', color='xkcd:coral', markevery=[-1], markersize=10, markeredgecolor='k')
                    ax1.plot(observation_catcher_1, 'o', color='xkcd:green', markevery=[-1], markersize=10, markeredgecolor='k')
                    ax1.plot(observation_catcher_2, 'o', color='xkcd:goldenrod', markevery=[-1], markersize=10, markeredgecolor='k')
                    ax1.plot(observation_catcher_3, 'o', color='xkcd:orchid', markevery=[-1], markersize=10, markeredgecolor='k')
                    ax1.plot(observation_catcher_4, 'o', color='xkcd:azure', markevery=[-1], markersize=10, markeredgecolor='k')
                    ax1.plot(observation_catcher_5, 'o', color='xkcd:orangered', markevery=[-1], markersize=10, markeredgecolor='k')
                    ax1.plot(observation_catcher_6, 'o', color='xkcd:tan', markevery=[-1], markersize=10, markeredgecolor='k')
                    ax1.set_xlabel('Time-Step', fontsize=15)
                    ax1.set_ylabel('Observation-Values', fontsize=15)
                    ax1.set_title('Observation Vector Of The Shadow Hand (NN-Input)', fontsize=18, loc="left")
                    ax1.legend(loc='upper right', facecolor='#74dd93', frameon=False, fontsize='large',
                               ncol=3, bbox_to_anchor=(1.03, 1.27))
                    ax1.set_facecolor('#74dd93')
                    ax1.set_xlim(xmin=-1)
                    ax1.set_xlim(xmax=99)
                    # ax1.set_ylim(ymin=-1.05)  # default value --> should be checked against the observed y min - hard coded
                    # ax1.set_ylim(ymax=1.1)    # default value --> should be checked against the observed y max - hard coded

                    ax2.clear()
                    barlist = ax2.bar(x_bar, color='xkcd:silver', width=0.6, height=0.025)
                    barlist[0].set_color('xkcd:coral')
                    barlist[1].set_color('xkcd:green')
                    barlist[2].set_color('xkcd:goldenrod')
                    barlist[3].set_color('xkcd:orchid')
                    barlist[4].set_color('xkcd:azure')
                    barlist[5].set_color('xkcd:orangered')
                    barlist[6].set_color('xkcd:tan')
                    ax2.set_yticklabels([])
                    ax2.set_xticks(x_bar)
                    ax2.set_xticklabels(x_lab, rotation=90, fontsize=11)
                    ax2.set_facecolor('#74dd93')
                    ax2.set_frame_on(False)
                    ax2.axes.get_yaxis().set_visible(False)

                    if not os.path.exists(directory_plot):
                        os.makedirs(directory_plot)
                    if not os.path.exists(directory_env):
                        os.makedirs(directory_env)
                    plt.savefig(directory_plot + "pic_{0:05d}.png".format(t),
                                facecolor=fig.get_facecolor(), edgecolor='none')
                    lov.save(directory_env + "pic_{0:05d}.png".format(t))
            except MujocoException as e:
                return self.generate_rollouts()

        if np.isnan(o_new).any():
            self.logger.warning('NaN caught during rollout generation. Trying again...')
            self.reset_all_rollouts()
            return self.generate_rollouts()

        obs.append(o.copy())
        achieved_goals.append(ag.copy())
        successes.append(success.copy())
        acts.append(u.copy())
        goals.append(self.g.copy())
        o[...] = o_new
        ag[...] = ag_new

    obs.append(o.copy())
    achieved_goals.append(ag.copy())
    self.initial_o[:] = o

    # the commented-out snippet below writes the observation values to a csv file using zip and the csv module
    # with open('two_finger_ac_values/foo_0.csv', 'w') as csvfile:
    #     values = csv.writer(csvfile)
    #     values.writerows(zip(observation_catcher, observation_catcher_1, observation_catcher_2, observation_catcher_3, observation_catcher_4, observation_catcher_5, observation_catcher_6, observation_catcher_7, observation_catcher_8, observation_catcher_9, observation_catcher_10, observation_catcher_11, observation_catcher_12, observation_catcher_13, observation_catcher_14, observation_catcher_15, observation_catcher_16, observation_catcher_17, observation_catcher_18, observation_catcher_19, observation_catcher_20, observation_catcher_21, observation_catcher_22, observation_catcher_23, observation_catcher_24, observation_catcher_25, observation_catcher_26, observation_catcher_27, observation_catcher_28, observation_catcher_29, observation_catcher_30, observation_catcher_31, observation_catcher_32, observation_catcher_33, observation_catcher_34, observation_catcher_35, observation_catcher_36, observation_catcher_37, observation_catcher_38, observation_catcher_39, observation_catcher_40, observation_catcher_41, observation_catcher_42, observation_catcher_43, observation_catcher_44, observation_catcher_45, observation_catcher_46, observation_catcher_47, observation_catcher_48, observation_catcher_49, observation_catcher_50, observation_catcher_51, observation_catcher_52, observation_catcher_53, observation_catcher_54, observation_catcher_55, observation_catcher_56, observation_catcher_57, observation_catcher_58, observation_catcher_59, observation_catcher_60))
    # csvfile.close()

    episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals)
    for key, value in zip(self.info_keys, info_values):
        episode['info_{}'.format(key)] = value

    # stats
    successful = np.array(successes)[-1, :]
    assert successful.shape == (self.rollout_batch_size,)
    success_rate = np.mean(successful)
    self.success_history.append(success_rate)
    if self.compute_Q:
        self.Q_history.append(np.mean(Qs))
    self.n_episodes += self.rollout_batch_size

    return convert_episode_to_batch_major(episode)
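# The rollout above saves one plot frame and one cropped environment frame per time step as
# pic_00000.png, pic_00001.png, ... An optional follow-up sketch (assuming imageio is available and
# the directories created above) that stitches those per-step frames into a video afterwards:
import os
import imageio

def frames_to_video(frame_dir, out_path, fps=20):
    """Read the per-step PNG frames written by generate_rollouts() and save them as one video."""
    frames = [imageio.imread(os.path.join(frame_dir, name))
              for name in sorted(os.listdir(frame_dir)) if name.endswith('.png')]
    imageio.mimsave(out_path, frames, fps=fps)

# e.g. frames_to_video(directory_env, 'shadow_hand_env.mp4')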
def initDemoBuffer(self, demoDataFile, update_stats=True):
    demoData = np.load(demoDataFile)
    info_keys = [key.replace('info_', '') for key in self.input_dims.keys() if key.startswith('info_')]
    info_values = [np.empty((self.T, self.rollout_batch_size, self.input_dims['info_' + key]), np.float32)
                   for key in info_keys]

    for epsd in range(self.num_demo):
        obs, acts, goals, achieved_goals = [], [], [], []
        i = 0
        for transition in range(self.T):
            obs.append([demoData['obs'][epsd][transition].get('observation')])
            acts.append([demoData['acs'][epsd][transition]])
            goals.append([demoData['obs'][epsd][transition].get('desired_goal')])
            achieved_goals.append([demoData['obs'][epsd][transition].get('achieved_goal')])
            for idx, key in enumerate(info_keys):
                info_values[idx][transition, i] = demoData['info'][epsd][transition][key]

        obs.append([demoData['obs'][epsd][self.T].get('observation')])
        achieved_goals.append([demoData['obs'][epsd][self.T].get('achieved_goal')])

        episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals)
        for key, value in zip(info_keys, info_values):
            episode['info_{}'.format(key)] = value

        episode = convert_episode_to_batch_major(episode)
        global demoBuffer
        demoBuffer.store_episode(episode)
        print("Demo buffer size currently ", demoBuffer.get_current_size())

        if update_stats:
            # add transitions to the normalizer to normalize the demo data as well
            episode['o_2'] = episode['o'][:, 1:, :]
            episode['ag_2'] = episode['ag'][:, 1:, :]
            num_normalizing_transitions = transitions_in_episode_batch(episode)
            transitions = self.sample_transitions(episode, num_normalizing_transitions)

            o, o_2, g, ag = transitions['o'], transitions['o_2'], transitions['g'], transitions['ag']
            transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g)
            # No need to preprocess o_2 and g_2 since this is only used for stats

            self.o_stats.update(transitions['o'])
            self.g_stats.update(transitions['g'])

            self.o_stats.recompute_stats()
            self.g_stats.recompute_stats()
        episode.clear()
def generate_rollouts(self):
    """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current
    policy acting on it accordingly.
    """
    self.reset_all_rollouts()

    # compute observations
    o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32)  # observations
    ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32)  # achieved goals
    o[:] = self.initial_o.astype(np.float32)
    ag[:] = self.initial_ag.astype(np.float32)

    # Add the initial states as "achieved goals"
    hashcode = self.countTracker.compute_hash_code(self.initial_ag[0].astype(np.float32))
    self.countTracker.update_count(hashcode)
    if len(self.initial_ag) > 1:
        hashcode_2 = self.countTracker.compute_hash_code(self.initial_ag[1].astype(np.float32))
        self.countTracker.update_count(hashcode_2)

    # generate episodes
    obs, achieved_goals, acts, goals, successes = [], [], [], [], []
    info_values = [np.empty((self.T, self.rollout_batch_size, self.dims['info_' + key]), np.float32)
                   for key in self.info_keys]
    Qs = []
    # time horizon = number of states achieved (50), not counting the initial state;
    # strictly speaking the initial state is also an achieved goal, which would make it 51
    for t in range(self.T):
        policy_output = self.policy.get_actions(
            o, ag, self.g,
            compute_Q=self.compute_Q,
            noise_eps=self.noise_eps if not self.exploit else 0.,
            random_eps=self.random_eps if not self.exploit else 0.,
            use_target_net=self.use_target_net)

        if self.compute_Q:
            u, Q = policy_output
            Qs.append(Q)
        else:
            u = policy_output

        if u.ndim == 1:
            # The non-batched case should still have a reasonable shape.
            u = u.reshape(1, -1)

        # rollout_batch_size is 2 by default; the goal is 3-dimensional
        o_new = np.empty((self.rollout_batch_size, self.dims['o']))
        ag_new = np.empty((self.rollout_batch_size, self.dims['g']))
        success = np.zeros(self.rollout_batch_size)

        # compute new states and observations
        for i in range(self.rollout_batch_size):
            try:
                # We fully ignore the reward here because it will have to be re-computed for HER.
                # self.envs[i].step(u[i]) returns the full environment feedback: an observation dict
                # with the observation vector, a 3-D 'achieved_goal', a 3-D 'desired_goal', and an
                # info dict carrying an 'is_success' boolean.
                curr_o_new, _, _, info = self.envs[i].step(u[i])
                if 'is_success' in info:
                    success[i] = info['is_success']
                o_new[i] = curr_o_new['observation'].astype(np.float32)
                ag_new[i] = curr_o_new['achieved_goal'].astype(np.float32)
                hashcode = self.countTracker.compute_hash_code(curr_o_new['achieved_goal'].astype(np.float32))
                self.countTracker.update_count(hashcode)
                for idx, key in enumerate(self.info_keys):
                    info_values[idx][t, i] = info[key]
                if self.render:
                    self.envs[i].render()
            except MujocoException as e:
                return self.generate_rollouts()

        if np.isnan(o_new).any():
            self.logger.warning('NaN caught during rollout generation. Trying again...')
            self.reset_all_rollouts()
            return self.generate_rollouts()

        obs.append(o.copy())
        achieved_goals.append(ag.copy())
        successes.append(success.copy())
        acts.append(u.copy())
        goals.append(self.g.copy())
        o[...] = o_new
        ag[...] = ag_new

    obs.append(o.copy())
    achieved_goals.append(ag.copy())
    self.initial_o[:] = o

    episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals)
    for key, value in zip(self.info_keys, info_values):
        episode['info_{}'.format(key)] = value

    # stats
    successful = np.array(successes)[-1, :]
    assert successful.shape == (self.rollout_batch_size,)
    success_rate = np.mean(successful)
    self.success_history.append(success_rate)
    if self.compute_Q:
        self.Q_history.append(np.mean(Qs))
    self.n_episodes += self.rollout_batch_size

    return convert_episode_to_batch_major(episode)
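# The countTracker used above is external to this snippet; the only interface it needs is
# compute_hash_code(achieved_goal) -> key and update_count(key). A minimal sketch of a compatible
# count-based tracker (an illustrative assumption, not the original class) could discretize achieved
# goals into grid cells and count visits per cell:
import numpy as np
from collections import defaultdict

class SimpleCountTracker:
    """Counts visits to discretized achieved-goal cells."""

    def __init__(self, cell_size=0.05):
        self.cell_size = cell_size
        self.counts = defaultdict(int)

    def compute_hash_code(self, achieved_goal):
        # Discretize each coordinate into a grid cell and use the cell-index tuple as the key.
        cell = np.floor(np.asarray(achieved_goal, dtype=np.float32) / self.cell_size).astype(int)
        return tuple(cell)

    def update_count(self, hashcode):
        self.counts[hashcode] += 1

    def get_count(self, hashcode):
        return self.counts[hashcode]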
def generate_rollouts(self):
    """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current
    policy acting on it accordingly.
    """
    self.reset_all_rollouts()

    # compute observations
    o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32)  # observations
    ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32)  # achieved goals
    o[:] = self.initial_o
    ag[:] = self.initial_ag

    # generate episodes
    obs, achieved_goals, acts, goals, successes = [], [], [], [], []
    dones = []
    info_values = [np.empty((self.T - 1, self.rollout_batch_size, self.dims['info_' + key]), np.float32)
                   for key in self.info_keys]
    Qs = []
    for t in range(self.T):
        policy_output = self.policy.get_actions(
            o, ag, self.g,
            compute_Q=self.compute_Q,
            noise_eps=self.noise_eps if not self.exploit else 0.,
            random_eps=self.random_eps if not self.exploit else 0.,
            use_target_net=self.use_target_net)

        if self.compute_Q:
            u, Q = policy_output
            Qs.append(Q)
        else:
            u = policy_output

        if u.ndim == 1:
            # The non-batched case should still have a reasonable shape.
            u = u.reshape(1, -1)

        o_new = np.empty((self.rollout_batch_size, self.dims['o']))
        ag_new = np.empty((self.rollout_batch_size, self.dims['g']))
        success = np.zeros(self.rollout_batch_size)

        # compute new states and observations
        obs_dict_new, _, done, info = self.venv.step(u)
        o_new = obs_dict_new['observation']
        ag_new = obs_dict_new['achieved_goal']
        success = np.array([i.get('is_success', 0.0) for i in info])

        if any(done):
            # here we assume all environments are done in ~the same number of steps, so we terminate
            # the rollouts whenever any of the envs returns done; the trick with vecenvs is not to
            # add the obs from the environments that are "done", because those are already
            # observations after a reset
            break

        for i, info_dict in enumerate(info):
            for idx, key in enumerate(self.info_keys):
                info_values[idx][t, i] = info[i][key]

        if np.isnan(o_new).any():
            self.logger.warn('NaN caught during rollout generation. Trying again...')
            self.reset_all_rollouts()
            return self.generate_rollouts()

        dones.append(done)
        obs.append(o.copy())
        achieved_goals.append(ag.copy())
        successes.append(success.copy())
        acts.append(u.copy())
        goals.append(self.g.copy())
        o[...] = o_new
        ag[...] = ag_new

    obs.append(o.copy())
    achieved_goals.append(ag.copy())

    episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals)
    for key, value in zip(self.info_keys, info_values):
        episode['info_{}'.format(key)] = value

    # stats
    successful = np.array(successes)[-1, :]
    assert successful.shape == (self.rollout_batch_size,)
    success_rate = np.mean(successful)
    self.success_history.append(success_rate)
    if self.compute_Q:
        self.Q_history.append(np.mean(Qs))
    self.n_episodes += self.rollout_batch_size

    return convert_episode_to_batch_major(episode)
def generate_rollouts(self):
    """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current
    policy acting on it accordingly.
    """
    self.reset_all_rollouts()

    # compute observations
    o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32)  # observations
    ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32)  # achieved goals
    o[:] = self.initial_o
    ag[:] = self.initial_ag

    # generate episodes
    obs, achieved_goals, acts, goals, successes, successes2, successes3 = [], [], [], [], [], [], []
    dones = []
    info_values = [np.empty((self.T - 1, self.rollout_batch_size, self.dims['info_' + key]), np.float32)
                   for key in self.info_keys]
    Qs = []
    Fs = []
    Ks = []
    for t in range(self.T):
        policy_output = self.policy.get_actions(
            o, ag, self.g,
            compute_Q=self.compute_Q,
            noise_eps=self.noise_eps if not self.exploit else 0.,
            random_eps=self.random_eps if not self.exploit else 0.,
            use_target_net=self.use_target_net)

        if self.compute_Q:
            u, Q = policy_output
            Qs.append(Q)
            Fs.append(np.abs(np.float32((o[:, 11:13] * (o[:, 11:13] < 0.0)).sum(axis=-1))).mean())  # block
            # Fs.append(np.abs(np.float32([e.env.prev_oforce for e in self.venv.envs])).mean())  # chip
            # Ks.append(np.abs(np.float32(o[:, 13].sum(axis=-1))).mean())  # block 6D
            Ks.append(0.25)  # block 4D, chip 3D
            # Ks.append(np.abs(np.float32(o[:, 14].sum(axis=-1))).mean())  # chip 5D
        else:
            u = policy_output

        if u.ndim == 1:
            # The non-batched case should still have a reasonable shape.
            u = u.reshape(1, -1)

        o_new = np.empty((self.rollout_batch_size, self.dims['o']))
        ag_new = np.empty((self.rollout_batch_size, self.dims['g']))
        success = np.zeros(self.rollout_batch_size)
        success2 = np.zeros(self.rollout_batch_size)

        # compute new states and observations
        obs_dict_new, _, done, info = self.venv.step(u)
        # self.venv.render()
        o_new = obs_dict_new['observation']
        ag_new = obs_dict_new['achieved_goal']
        success = np.array([i.get('is_success', 0.0) for i in info])
        success2 = (np.float32(o[:, 11:13].sum(axis=-1)) * 1000.0 > -300.0)  # block -147.15/3*6
        # success2 = (np.float32(self.venv.envs[0].env.prev_oforce < self.venv.envs[0].env.object_fragility))  # chip

        if any(done):
            # here we assume all environments are done in ~the same number of steps, so we terminate
            # the rollouts whenever any of the envs returns done; the trick with vecenvs is not to
            # add the obs from the environments that are "done", because those are already
            # observations after a reset
            break

        for i, info_dict in enumerate(info):
            for idx, key in enumerate(self.info_keys):
                info_values[idx][t, i] = info[i][key]

        if np.isnan(o_new).any():
            self.logger.warn('NaN caught during rollout generation. Trying again...')
            self.reset_all_rollouts()
            return self.generate_rollouts()

        dones.append(done)
        obs.append(o.copy())
        achieved_goals.append(ag.copy())
        successes.append(success.copy())
        successes2.append(success2.copy())
        # successes3.append(success3.copy())
        acts.append(u.copy())
        goals.append(self.g.copy())
        o[...] = o_new
        ag[...] = ag_new

    # print("--------------------New Rollout--------------------")
    obs.append(o.copy())
    achieved_goals.append(ag.copy())

    episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals)
    for key, value in zip(self.info_keys, info_values):
        episode['info_{}'.format(key)] = value

    # stats
    successful = np.array(successes)[-1, :]
    successful2 = np.array(successes2)
    assert successful.shape == (self.rollout_batch_size,)
    success_rate = np.mean(successful)
    success_rate2 = np.mean(successful2.mean(axis=0))
    success_rate3 = np.mean(successful2.min(axis=0) * successful)
    self.success_history.append(success_rate)
    self.success_history2.append(success_rate2)
    self.success_history3.append(success_rate3)
    if self.compute_Q:
        self.Q_history.append(np.mean(Qs))
        self.F_history.append(np.mean(Fs))
        self.K_history.append(np.mean(Ks))
    self.n_episodes += self.rollout_batch_size

    return convert_episode_to_batch_major(episode)
def generate_rollouts(self):
    """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current
    policy acting on it accordingly.
    """
    self.reset_all_rollouts()

    # compute observations
    o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32)  # observations
    ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32)  # achieved goals
    o[:] = self.initial_o
    ag[:] = self.initial_ag

    # generate episodes
    obs, achieved_goals, acts, goals, successes = [], [], [], [], []
    info_values = [np.empty((self.T, self.rollout_batch_size, self.dims['info_' + key]), np.float32)
                   for key in self.info_keys]
    Qs = []
    for t in range(self.T):
        policy_output = self.policy.get_actions(
            o, ag, self.g,
            compute_Q=self.compute_Q,
            noise_eps=self.noise_eps if not self.exploit else 0.,
            random_eps=self.random_eps if not self.exploit else 0.,
            use_target_net=self.use_target_net)

        if self.compute_Q:
            u, Q = policy_output
            Qs.append(Q)
        else:
            u = policy_output

        if u.ndim == 1:
            # The non-batched case should still have a reasonable shape.
            u = u.reshape(1, -1)

        o_new = np.empty((self.rollout_batch_size, self.dims['o']))
        ag_new = np.empty((self.rollout_batch_size, self.dims['g']))
        success = np.zeros(self.rollout_batch_size)

        # compute new states and observations
        for i in range(self.rollout_batch_size):
            rewards = []
            infos = []
            try:
                # We fully ignore the reward here because it will have to be re-computed for HER.
                curr_o_new, reward, _, info = self.envs[i].step(u[i])
                rewards.append(reward)
                infos.append(info)
                if 'is_success' in info:
                    success[i] = info['is_success']
                o_new[i] = curr_o_new['observation']
                ag_new[i] = curr_o_new['achieved_goal']
                for idx, key in enumerate(self.info_keys):
                    info_values[idx][t, i] = info[key]

                env = self.envs[i]  # type: PushEnv
                env.render_camera = "camera_side"
                img_left = env.render(mode='rgb_array')
                env.render_camera = "camera_topdown"
                img_center = env.render(mode='rgb_array')

                # render rewards along the bottom of the top-down view
                img_center[-lower_part, :10] = orange
                img_center[-lower_part, -10:] = orange
                if TRAJ_SIZE < 512:
                    p_rew_x = 0
                    for j, r in enumerate(rewards):
                        rew_x = int(j * width_factor)
                        if r < 0:
                            color = blue if infos[j]["grasped"] else red
                            img_center[-1:, p_rew_x:rew_x] = color
                        else:
                            rew_y = int(r / max_reward * lower_part)
                            color = blue if infos[j]["grasped"] else orange
                            img_center[-rew_y - 1:, p_rew_x:rew_x] = color
                        p_rew_x = rew_x
                else:
                    for j, r in enumerate(rewards):
                        rew_x = int(j * width_factor)
                        if r < 0:
                            color = blue if infos[j]["grasped"] else red
                            img_center[-1:, rew_x] = color
                        else:
                            rew_y = int(r / max_reward * lower_part)
                            color = blue if infos[j]["grasped"] else orange
                            img_center[-rew_y - 1:, rew_x] = color

                env.render_camera = "camera_front"
                img_right = env.render(mode='rgb_array')

                img_left = Image.fromarray(np.uint8(img_left))
                draw_left = ImageDraw.Draw(img_left)
                draw_left.text((20, 20), "Batch %i" % (i + 1), fill="black", font=font)
                img_right = Image.fromarray(np.uint8(img_right))
                draw_right = ImageDraw.Draw(img_right)
                draw_right.text((20, 20), "Step %i" % info["l"], fill="black", font=font)

                self.video.append(np.hstack((np.array(img_left), np.array(img_center), np.array(img_right))))
            except MujocoException as e:
                return self.generate_rollouts()

        if np.isnan(o_new).any():
            self.logger.warning('NaN caught during rollout generation. Trying again...')
            self.reset_all_rollouts()
            return self.generate_rollouts()

        obs.append(o.copy())
        achieved_goals.append(ag.copy())
        successes.append(success.copy())
        acts.append(u.copy())
        goals.append(self.g.copy())
        o[...] = o_new
        ag[...] = ag_new

    obs.append(o.copy())
    achieved_goals.append(ag.copy())
    self.initial_o[:] = o

    episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals)
    for key, value in zip(self.info_keys, info_values):
        episode['info_{}'.format(key)] = value

    # stats
    successful = np.array(successes)[-1, :]
    assert successful.shape == (self.rollout_batch_size,)
    success_rate = np.mean(successful)
    self.success_history.append(success_rate)
    if self.compute_Q:
        self.Q_history.append(np.mean(Qs))
    self.n_episodes += self.rollout_batch_size

    imageio.mimsave('play_her.mp4', self.video, fps=20)
    return convert_episode_to_batch_major(episode)
def generate_rollouts(self):
    """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current
    policy acting on it accordingly.
    """
    self.reset_all_rollouts()

    # compute observations
    o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32)  # observations
    ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32)  # achieved goals
    o[:] = self.initial_o
    ag[:] = self.initial_ag

    # generate episodes
    obs, achieved_goals, acts, goals, successes = [], [], [], [], []
    info_values = [np.empty((self.T, self.rollout_batch_size, self.dims['info_' + key]), np.float32)
                   for key in self.info_keys]
    Qs = []
    for t in range(self.T):
        policy_output = self.policy.get_actions(
            o, ag, self.g,
            compute_Q=self.compute_Q,
            noise_eps=self.noise_eps if not self.exploit else 0.,
            random_eps=self.random_eps if not self.exploit else 0.,
            use_target_net=self.use_target_net)

        if self.compute_Q:
            u, Q = policy_output
            Qs.append(Q)
        else:
            u = policy_output

        if u.ndim == 1:
            # The non-batched case should still have a reasonable shape.
            u = u.reshape(1, -1)

        o_new = np.empty((self.rollout_batch_size, self.dims['o']))
        ag_new = np.empty((self.rollout_batch_size, self.dims['g']))
        success = np.zeros(self.rollout_batch_size)

        # compute new states and observations
        for i in range(self.rollout_batch_size):
            try:
                # We fully ignore the reward here because it will have to be re-computed for HER.
                curr_o_new, _, _, info = self.envs[i].step(u[i])
                if 'is_success' in info:
                    success[i] = info['is_success']
                o_new[i] = curr_o_new['observation']
                ag_new[i] = curr_o_new['achieved_goal']
                for idx, key in enumerate(self.info_keys):
                    info_values[idx][t, i] = info[key]
                if self.render:
                    self.envs[i].render()
            except MujocoException as e:
                return self.generate_rollouts()

        if np.isnan(o_new).any():
            self.logger.warning('NaN caught during rollout generation. Trying again...')
            self.reset_all_rollouts()
            return self.generate_rollouts()

        obs.append(o.copy())
        achieved_goals.append(ag.copy())
        successes.append(success.copy())
        acts.append(u.copy())
        goals.append(self.g.copy())
        o[...] = o_new
        ag[...] = ag_new

    obs.append(o.copy())
    achieved_goals.append(ag.copy())
    self.initial_o[:] = o

    episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals)
    for key, value in zip(self.info_keys, info_values):
        episode['info_{}'.format(key)] = value

    # stats
    successful = np.array(successes)[-1, :]
    assert successful.shape == (self.rollout_batch_size,)
    success_rate = np.mean(successful)
    self.success_history.append(success_rate)
    if self.compute_Q:
        self.Q_history.append(np.mean(Qs))
    self.n_episodes += self.rollout_batch_size

    return convert_episode_to_batch_major(episode)
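# A rough shape check (not part of the original code) for the batch-major episode this baseline
# version returns, assuming convert_episode_to_batch_major() swaps the leading time and batch axes
# as in the OpenAI Baselines helper. `worker` stands for any configured RolloutWorker instance; T
# and the dimensions below are placeholders.
def check_episode_shapes(worker, T=50, batch=2, dim_o=25, dim_g=3, dim_u=4):
    episode = worker.generate_rollouts()
    assert episode['o'].shape == (batch, T + 1, dim_o)    # T transitions plus the final observation
    assert episode['ag'].shape == (batch, T + 1, dim_g)   # achieved goals carry the same extra entry
    assert episode['u'].shape == (batch, T, dim_u)        # one action per transition
    assert episode['g'].shape == (batch, T, dim_g)        # the desired goal repeated at every step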
def generate_rollouts(self):
    """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current
    policy acting on it accordingly.
    """
    self.reset_all_rollouts()

    # compute observations
    o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32)  # observations
    ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32)  # achieved goals
    o[:] = self.initial_o
    ag[:] = self.initial_ag

    # generate episodes
    obs, achieved_goals, acts, goals, successes, successes2 = [], [], [], [], [], []
    dones = []
    info_values = [np.empty((self.T - 1, self.rollout_batch_size, self.dims['info_' + key]), np.float32)
                   for key in self.info_keys]
    Qs = []
    Fs = []
    Ks = []

    with Session(bitfile="SCPC-lv-noFIFO_FPGATarget_FPGAmainepos_1XvgQEcJVeE.lvbitx",
                 resource="rio://10.157.23.150/RIO0") as session:
        act_Rj1 = session.registers['Mod3/AO0']
        enc_Rj1 = session.registers['Rj1']
        act_Rj2 = session.registers['Mod3/AO1']
        enc_Rj2 = session.registers['Rj2']
        act_Lj1 = session.registers['Mod3/AO3']
        enc_Lj1 = session.registers['Lj1']
        act_Lj2 = session.registers['Mod3/AO4']
        enc_Lj2 = session.registers['Lj2']
        sen_f = session.registers['fsensor']
        sen_e = session.registers['fencoder']

        emergency = False
        Re1 = enc_Rj1.read()
        Re2 = enc_Rj2.read()
        Le1 = enc_Lj1.read()
        Le2 = enc_Lj2.read()
        f_sensor = 5.1203 * sen_f.read() - 5.2506
        e_sensor = (((sen_e.read()) - (f_sensor / 100.0 * 0.15)) - 2.9440) / 0.0148

        Rj = self.R_j_inv * self.R_e * np.array([[Re1 - self.offset[0]], [-Re2 + self.offset[1]]]) * np.pi / 180.0
        Lj = self.R_j_inv_L * self.R_e * np.array([[Le1 - self.offset[2]], [Le2 - self.offset[3]]]) * np.pi / 180.0
        Prev_Rj = Rj
        Prev_Lj = Lj

        xR = self.L1 * np.cos(Rj[0, 0] + np.pi / 2.0) + self.L2 * np.cos(Rj[0, 0] - Rj[1, 0] + np.pi / 2.0)
        yR = self.L1 * np.sin(Rj[0, 0] + np.pi / 2.0) + self.L2 * np.sin(Rj[0, 0] - Rj[1, 0] + np.pi / 2.0)
        xL = self.L1 * np.cos(Lj[0, 0] + np.pi / 2.0) + self.L2 * np.cos(Lj[0, 0] + Lj[1, 0] + np.pi / 2.0)
        yL = self.L1 * np.sin(Lj[0, 0] + np.pi / 2.0) + self.L2 * np.sin(Lj[0, 0] + Lj[1, 0] + np.pi / 2.0)
        P_R = np.array([xR, yR])
        P_L = np.array([xL, yL])
        Prel_R = self.Pc_R - P_R
        Prel_L = self.Pc_L - P_L
        l_R = np.sqrt(Prel_R[0] * Prel_R[0] + Prel_R[1] * Prel_R[1])
        l_L = np.sqrt(Prel_L[0] * Prel_L[0] + Prel_L[1] * Prel_L[1])
        p_R = np.array([[l_R], [np.arctan2(-Prel_R[1], -Prel_R[0])]])
        p_L = np.array([[l_L], [np.arctan2(Prel_L[1], Prel_L[0])]])

        for t in range(self.T):
            policy_output = self.policy.get_actions(
                o, ag, self.g,
                compute_Q=self.compute_Q,
                noise_eps=self.noise_eps if not self.exploit else 0.,
                random_eps=self.random_eps if not self.exploit else 0.,
                use_target_net=self.use_target_net)

            if self.compute_Q:
                u, Q = policy_output
                Qs.append(Q)
                Fs.append(f_sensor)
                Ks.append(self.stiffness)
            else:
                u = policy_output

            if u.ndim == 1:
                # The non-batched case should still have a reasonable shape.
                u = u.reshape(1, -1)

            o_new = np.empty((self.rollout_batch_size, self.dims['o']))
            ag_new = np.empty((self.rollout_batch_size, self.dims['g']))
            success = np.zeros(self.rollout_batch_size)
            success2 = np.zeros(self.rollout_batch_size)

            # compute new states and observations
            self.stiffness_lim = np.clip(self.stiffness_lim + 0.2 * u[0][3], 0.1, 1.0)
            self.stiffness = np.clip(self.stiffness + 0.2 * u[0][2], 0, self.stiffness_lim)
            u[0][0] = np.clip(u[0][0], -self.l_step_limit * 14.0, self.l_step_limit * 14.0)
            u[0][1] = np.clip(u[0][1], -8.0 * self.th_step_limit * 2.0, self.th_step_limit * 2.0)

            if emergency == False:
                vel_R = Rj - Prev_Rj
                vel_L = Lj - Prev_Lj
                if (vel_R[0, 0] > self.vel_limit or vel_R[0, 0] < -self.vel_limit
                        or vel_L[0, 0] > self.vel_limit or vel_L[0, 0] < -self.vel_limit):
                    emergency = True
                    r = np.array([[self.stiffness], [1.0]]) * 0.5
                    print("***************Robot going insane! Safety on!***************")
                else:
                    r = np.array([[self.stiffness], [1.0]])
            else:
                r = np.array([[self.stiffness], [1.0]]) * 0.5

            des_p_R = np.array([[np.min([np.max([p_R[0, 0] + u[0][0] / 14.0, -self.l_limit / 2.0]), self.l_limit / 4.0])],
                                [np.min([np.max([p_R[1, 0] + u[0][1] / 2.0, -self.th_limit]), self.th_limit])]])
            des_p_L = np.array([[np.min([np.max([p_L[0, 0] + u[0][0] / 14.0, -self.l_limit / 2.0]), self.l_limit / 4.0])],
                                [np.min([np.max([p_L[1, 0] + u[0][1] / 2.0, -self.th_limit]), self.th_limit])]])

            Jp_R = np.matrix([[-Prel_R[0] / l_R, -Prel_R[1] / l_R],
                              [Prel_R[1] / l_R / l_R, -Prel_R[0] / l_R / l_R]])
            Jp_L = np.matrix([[-Prel_L[0] / l_L, -Prel_L[1] / l_L],
                              [Prel_L[1] / l_L / l_L, -Prel_L[0] / l_L / l_L]])
            Jp_inv_R = np.matrix([[Jp_R[1, 1] / (Jp_R[0, 0] * Jp_R[1, 1] - Jp_R[0, 1] * Jp_R[1, 0]),
                                   -Jp_R[0, 1] / (Jp_R[0, 0] * Jp_R[1, 1] - Jp_R[0, 1] * Jp_R[1, 0])],
                                  [-Jp_R[1, 0] / (Jp_R[0, 0] * Jp_R[1, 1] - Jp_R[0, 1] * Jp_R[1, 0]),
                                   Jp_R[0, 0] / (Jp_R[0, 0] * Jp_R[1, 1] - Jp_R[0, 1] * Jp_R[1, 0])]])
            Jp_inv_L = np.matrix([[Jp_L[1, 1] / (Jp_L[0, 0] * Jp_L[1, 1] - Jp_L[0, 1] * Jp_L[1, 0]),
                                   -Jp_L[0, 1] / (Jp_L[0, 0] * Jp_L[1, 1] - Jp_L[0, 1] * Jp_L[1, 0])],
                                  [-Jp_L[1, 0] / (Jp_L[0, 0] * Jp_L[1, 1] - Jp_L[0, 1] * Jp_L[1, 0]),
                                   Jp_L[0, 0] / (Jp_L[0, 0] * Jp_L[1, 1] - Jp_L[0, 1] * Jp_L[1, 0])]])

            J_R = np.matrix([[-yR, self.L2 * np.cos(Rj[0, 0] - Rj[1, 0])],
                             [xR, self.L2 * np.sin(Rj[0, 0] - Rj[1, 0])]])
            J_L = np.matrix([[-yL, -self.L2 * np.cos(Lj[0, 0] + Lj[1, 0])],
                             [xL, -self.L2 * np.sin(Lj[0, 0] + Lj[1, 0])]])
            J_inv_R = np.matrix([[J_R[1, 1] / (J_R[0, 0] * J_R[1, 1] - J_R[0, 1] * J_R[1, 0]),
                                  -J_R[0, 1] / (J_R[0, 0] * J_R[1, 1] - J_R[0, 1] * J_R[1, 0])],
                                 [-J_R[1, 0] / (J_R[0, 0] * J_R[1, 1] - J_R[0, 1] * J_R[1, 0]),
                                  J_R[0, 0] / (J_R[0, 0] * J_R[1, 1] - J_R[0, 1] * J_R[1, 0])]])
            J_inv_L = np.matrix([[J_L[1, 1] / (J_L[0, 0] * J_L[1, 1] - J_L[0, 1] * J_L[1, 0]),
                                  -J_L[0, 1] / (J_L[0, 0] * J_L[1, 1] - J_L[0, 1] * J_L[1, 0])],
                                 [-J_L[1, 0] / (J_L[0, 0] * J_L[1, 1] - J_L[0, 1] * J_L[1, 0]),
                                  J_L[0, 0] / (J_L[0, 0] * J_L[1, 1] - J_L[0, 1] * J_L[1, 0])]])

            max_kj_R = np.transpose(self.R_j) * np.matrix([[2 * self.Ksc, 0], [0, 2 * self.Ksc]]) * self.R_j
            max_kj_L = np.transpose(self.R_j_L) * np.matrix([[2 * self.Ksc, 0], [0, 2 * self.Ksc]]) * self.R_j_L
            max_k_R = np.transpose(J_inv_R) * max_kj_R * J_inv_R
            max_k_L = np.transpose(J_inv_L) * max_kj_L * J_inv_L
            max_kp_R = np.transpose(Jp_inv_R) * max_k_R * Jp_inv_R
            max_kp_L = np.transpose(Jp_inv_L) * max_k_L * Jp_inv_L
            max_kp_R[0, 1] = 0.0
            max_kp_R[1, 0] = 0.0
            max_kp_L[0, 1] = 0.0
            max_kp_L[1, 0] = 0.0

            des_Fp_R = max_kp_R * (r * (des_p_R - p_R)) * 0.9
            des_Fp_L = max_kp_L * (r * (des_p_L - p_L)) * 0.9
            des_F_R = np.transpose(Jp_R) * des_Fp_R
            des_F_L = np.transpose(Jp_L) * des_Fp_L
            des_tau_R = np.transpose(J_R) * des_F_R
            des_tau_L = np.transpose(J_L) * des_F_L

            if Rj[1, 0] > -0.2:
                des_tau_R += np.array([[0.0], [-0.05]])
            if Lj[1, 0] > -0.2:
                des_tau_L += np.array([[0.0], [-0.05]])
            if Rj[1, 0] < -1.8:
                des_tau_R += np.array([[0.0], [0.05]])
            if Lj[1, 0] < -1.8:
                des_tau_L += np.array([[0.0], [0.05]])
            if Rj[0, 0] > 0:
                des_tau_R += np.array([[-0.05], [0.0]])
            if Lj[0, 0] < 0:
                des_tau_L += np.array([[0.05], [0.0]])
            if Rj[0, 0] < -0.8:
                des_tau_R += np.array([[0.05], [0.0]])
            if Lj[0, 0] > 0.8:
                des_tau_L += np.array([[-0.05], [0.0]])

            des_mR = (np.transpose(self.R_j_inv) * des_tau_R / (2 * self.Ksc) + self.R_j * Rj) / self.Rm
            des_mL = (np.transpose(self.R_j_inv_L) * des_tau_L / (2 * self.Ksc) + self.R_j_L * Lj) / self.Rm

            act_Rj1.write(np.min([np.max([des_mR[0, 0] * 180.0 / np.pi * 0.117258, -10.0]), 10.0]))
            act_Rj2.write(np.min([np.max([des_mR[1, 0] * 180.0 / np.pi * 0.117541, -10.0]), 10.0]))
            act_Lj1.write(np.min([np.max([des_mL[0, 0] * 180.0 / np.pi * 0.117729, -10.0]), 10.0]))
            act_Lj2.write(np.min([np.max([des_mL[1, 0] * 180.0 / np.pi * 0.117679, -10.0]), 10.0]))
            time.sleep(0.004)

            Re1 = enc_Rj1.read()
            Re2 = enc_Rj2.read()
            Le1 = enc_Lj1.read()
            Le2 = enc_Lj2.read()
            f_sensor = 5.1203 * sen_f.read() - 5.2506
            e_sensor = (((sen_e.read()) - (f_sensor / 100.0 * 0.15)) - 2.9440) / 0.0148

            Prev_Rj = Rj
            Prev_Lj = Lj
            Rj = self.R_j_inv * self.R_e * np.array([[Re1 - self.offset[0]], [-Re2 + self.offset[1]]]) * np.pi / 180.0
            Lj = self.R_j_inv_L * self.R_e * np.array([[Le1 - self.offset[2]], [Le2 - self.offset[3]]]) * np.pi / 180.0
            xR = self.L1 * np.cos(Rj[0, 0] + np.pi / 2.0) + self.L2 * np.cos(Rj[0, 0] - Rj[1, 0] + np.pi / 2.0)
            yR = self.L1 * np.sin(Rj[0, 0] + np.pi / 2.0) + self.L2 * np.sin(Rj[0, 0] - Rj[1, 0] + np.pi / 2.0)
            xL = self.L1 * np.cos(Lj[0, 0] + np.pi / 2.0) + self.L2 * np.cos(Lj[0, 0] + Lj[1, 0] + np.pi / 2.0)
            yL = self.L1 * np.sin(Lj[0, 0] + np.pi / 2.0) + self.L2 * np.sin(Lj[0, 0] + Lj[1, 0] + np.pi / 2.0)
            P_R = np.array([xR, yR])
            P_L = np.array([xL, yL])
            Prel_R = self.Pc_R - P_R
            Prel_L = self.Pc_L - P_L
            l_R = np.sqrt(Prel_R[0] * Prel_R[0] + Prel_R[1] * Prel_R[1])
            l_L = np.sqrt(Prel_L[0] * Prel_L[0] + Prel_L[1] * Prel_L[1])
            p_R = np.array([[l_R], [np.arctan2(-Prel_R[1], -Prel_R[0])]])
            p_L = np.array([[l_L], [np.arctan2(Prel_L[1], Prel_L[0])]])

            observation = np.array([[p_R[0, 0] * 10 - 1.0, p_L[0, 0] * 10 - 1.0, p_R[1, 0], p_L[1, 0],
                                     ((e_sensor - 2.9440) / 0.0148 * np.pi / 180.0 - p_R[1, 0]),
                                     ((e_sensor - 2.9440) / 0.0148 * np.pi / 180.0 - p_L[1, 0]),
                                     (self.g[0][0] - (e_sensor - 2.9440) / 0.0148 * np.pi / 180.0),
                                     des_Fp_R[0, 0] * 0.1, des_Fp_L[0, 0] * 0.1,
                                     vel_R[0, 0], vel_R[1, 0], vel_L[0, 0], vel_L[1, 0],
                                     self.stiffness, self.stiffness_lim]])
            obs_dict_new = dict(
                observation=observation,
                achieved_goal=np.array([[((e_sensor - 2.9440) / 0.0148) * np.pi / 180.0,
                                         vel_R[0, 0], vel_R[1, 0], vel_L[0, 0], vel_L[1, 0],
                                         des_Fp_R[0, 0] * 0.1, des_Fp_L[0, 0] * 0.1]]),
                desired_goal=self.g)
            done = [False] if t < self.T - 1 else [True]
            info = [{
                'is_success': self._is_success(obs_dict_new['achieved_goal'][0], obs_dict_new['desired_goal'][0]),
            }]

            o_new = obs_dict_new['observation']
            ag_new = obs_dict_new['achieved_goal']
            success = np.array([i.get('is_success', 0.0) for i in info])
            success2 = (np.float32(f_sensor < self.object_fragility))

            if any(done):
                # here we assume all environments are done in ~the same number of steps, so we
                # terminate the rollouts whenever any of the envs returns done; the trick with
                # vecenvs is not to add the obs from the environments that are "done", because
                # those are already observations after a reset
                break

            for i, info_dict in enumerate(info):
                for idx, key in enumerate(self.info_keys):
                    info_values[idx][t, i] = info[i][key]

            # if np.isnan(o_new).any():
            #     self.logger.warn('NaN caught during rollout generation. Trying again...')
            #     self.reset_all_rollouts()
            #     return self.generate_rollouts()

            dones.append(done)
            obs.append(o.copy())
            achieved_goals.append(ag.copy())
            successes.append(success.copy())
            successes2.append(success2.copy())
            # successes3.append(success3.copy())
            acts.append(u.copy())
            goals.append(self.g.copy())
            o[...] = o_new
            ag[...] = ag_new

        # print("--------------------New Rollout--------------------")
        obs.append(o.copy())
        achieved_goals.append(ag.copy())

        episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals)
        for key, value in zip(self.info_keys, info_values):
            episode['info_{}'.format(key)] = value

        # stats
        successful = np.array(successes)[-1, :]
        successful2 = np.array(successes2)
        assert successful.shape == (self.rollout_batch_size,)
        success_rate = np.mean(successful)
        success_rate2 = np.mean(successful2.mean(axis=0))
        success_rate3 = np.mean(successful2.min(axis=0) * successful)
        self.success_history.append(success_rate)
        self.success_history2.append(success_rate2)
        self.success_history3.append(success_rate3)
        if self.compute_Q:
            self.Q_history.append(np.mean(Qs))
            self.F_history.append(np.mean(Fs))
            self.K_history.append(np.mean(Ks))
        self.n_episodes += self.rollout_batch_size

        return convert_episode_to_batch_major(episode)
def generate_rollouts_ker(self):
    """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current
    policy acting on it accordingly.
    """
    self.reset_all_rollouts()
    episodes = []
    episodes_batch = []

    # compute observations
    o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32)  # observations
    ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32)  # achieved goals
    o[:] = self.initial_o
    ag[:] = self.initial_ag

    # generate episodes
    obs, achieved_goals, acts, goals, successes = [], [], [], [], []
    dones = []
    info_values = [np.empty((self.T - 1, self.rollout_batch_size, self.dims['info_' + key]), np.float32)
                   for key in self.info_keys]
    Qs = []
    for t in range(self.T):
        policy_output = self.policy.get_actions(
            o, ag, self.g,
            compute_Q=self.compute_Q,
            noise_eps=self.noise_eps if not self.exploit else 0.,
            random_eps=self.random_eps if not self.exploit else 0.,
            use_target_net=self.use_target_net)

        if self.compute_Q:
            u, Q = policy_output
            Qs.append(Q)
        else:
            u = policy_output

        if u.ndim == 1:
            # The non-batched case should still have a reasonable shape.
            u = u.reshape(1, -1)

        o_new = np.empty((self.rollout_batch_size, self.dims['o']))
        ag_new = np.empty((self.rollout_batch_size, self.dims['g']))
        success = np.zeros(self.rollout_batch_size)

        # compute new states and observations; do not use the returned reward here,
        # it is recomputed in her_sampler.py
        obs_dict_new, _, done, info = self.venv.step(u)
        o_new = obs_dict_new['observation']
        ag_new = obs_dict_new['achieved_goal']
        success = np.array([i.get('is_success', 0.0) for i in info])

        # no need
        if any(done):
            # here we assume all environments are done in ~the same number of steps, so we terminate
            # the rollouts whenever any of the envs returns done; the trick with vecenvs is not to
            # add the obs from the environments that are "done", because those are already
            # observations after a reset
            break

        # no need
        for i, info_dict in enumerate(info):
            for idx, key in enumerate(self.info_keys):
                info_values[idx][t, i] = info[i][key]

        # no need
        if np.isnan(o_new).any():
            self.logger.warn('NaN caught during rollout generation. Trying again...')
            self.reset_all_rollouts()
            return self.generate_rollouts()

        dones.append(done)
        obs.append(o.copy())
        achieved_goals.append(ag.copy())
        successes.append(success.copy())
        acts.append(u.copy())
        goals.append(self.g.copy())
        o[...] = o_new
        ag[...] = ag_new

    obs.append(o.copy())
    achieved_goals.append(ag.copy())

    # # ----------------Kaleidoscope ER---------------------------
    # original_ka_episodes = self.ker.ker_process(obs, acts, goals, achieved_goals)
    # # ----------------end---------------------------
    # # ----------------pack up as transition---------------------------
    # for (obs, acts, goals, achieved_goals) in original_ka_episodes:
    #     episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals)
    #     for key, value in zip(self.info_keys, info_values):
    #         episode['info_{}'.format(key)] = value
    #     episodes.append(episode)
    # # ----------------end---------------------------

    n_KER = None
    if self.dynamic_KER:
        # set_trace()
        assert self.dynamic_KER < 10000 and self.dynamic_KER > 10
        n_KER_1 = self.dynamic_KER % 10
        n_KER_2 = self.dynamic_KER // 10
        assert n_KER_1 != 0 and n_KER_2 != 0
        n_KER = n_KER_1
        ag = np.array(achieved_goals)
        delta_movement = np.linalg.norm(ag[1:] - ag[0], axis=2)  # compare with the object starting pos
        if any(delta_movement > 0.05):  # if the object has moved
            # set_trace()
            self.count_ray += 1
            # print('move the ag')
            print('move the ag:', self.count_ray)
            n_KER = n_KER_2
            # print('xag:', x)
            # print('yag:', y)
            # print('g:', self.g)
            # print('successes:', successes)
    else:
        # ******************print
        # set_trace()
        ag = np.array(achieved_goals)
        delta_movement = np.linalg.norm(ag[1:] - ag[0], axis=2)  # compare with the object starting pos
        if any(delta_movement > 0.05):  # if the object has moved
            self.count_ray += 1
            print('move the ag:', self.count_ray)
        # ******************print

    original_ka_episodes = self.ker.ker_process(obs, acts, goals, achieved_goals, n_KER)
    # ----------------end---------------------------
    # ----------------pack up as transition---------------------------
    for (obs, acts, goals, achieved_goals) in original_ka_episodes:
        episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals)
        for key, value in zip(self.info_keys, info_values):
            episode['info_{}'.format(key)] = value
        episodes.append(episode)
    # ----------------end---------------------------

    # if self.dynamic_mirror_origin == 'True':  # untested
    #     temp_trajs = []
    #     step = 5
    #     for i in range(step):
    #         # ----------------Kaleidoscope ER---------------------------
    #         original_ka_episodes = self.ker.ker_process(obs, acts, goals, achieved_goals, n_KER, step=i)
    #         temp_trajs.append(original_ka_episodes)
    #         # ----------------end---------------------------
    #
    #     for temp_traj in temp_trajs:
    #         # ----------------pack up as transition---------------------------
    #         for (obs, acts, goals, achieved_goals) in temp_traj:
    #             episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals)
    #             for key, value in zip(self.info_keys, info_values):
    #                 episode['info_{}'.format(key)] = value
    #             episodes.append(episode)
    #         # ----------------end---------------------------
    # else:
    #     # ----------------Kaleidoscope ER---------------------------
    #     original_ka_episodes = self.ker.ker_process(obs, acts, goals, achieved_goals, n_KER)
    #     # ----------------end---------------------------
    #     # ----------------pack up as transition---------------------------
    #     for (obs, acts, goals, achieved_goals) in original_ka_episodes:
    #         episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals)
    #         for key, value in zip(self.info_keys, info_values):
    #             episode['info_{}'.format(key)] = value
    #         episodes.append(episode)
    #     # ----------------end---------------------------

    # stats
    successful = np.array(successes)[-1, :]
    assert successful.shape == (self.rollout_batch_size,)
    success_rate = np.mean(successful)
    self.success_history.append(success_rate)
    if self.compute_Q:
        self.Q_history.append(np.mean(Qs))
    # if success_rate != 0:
    #     set_trace()

    mul_factor = 1
    self.n_episodes += (mul_factor * self.rollout_batch_size)

    # ----------------format processing---------------------------
    # return dicts with keys: ['o', 'u', 'g', 'ag', 'info_is_success']
    for episode in episodes:
        episode_batch = convert_episode_to_batch_major(episode)
        episodes_batch.append(episode_batch)
    # ----------------end---------------------------

    return episodes_batch
def generate_rollouts(self): """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current policy acting on it accordingly. """ # addition for multi-tasks structures # decides whether the next runs are made to compute progress (exploit = True, means no noise on actions). if (self.structure == 'curious' or self.structure == 'task_experts') and not self.eval: self.exploit = True if np.random.random() < 0.1 else False if self.exploit and self.structure == 'curious': self.p = 1 / self.nb_tasks * np.ones([self.nb_tasks]) elif self.eval: self.exploit = True self.p = 1 / self.nb_tasks * np.ones([self.nb_tasks]) self.reset_all_rollouts() # compute observations o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32) # observations ag = np.empty((self.rollout_batch_size, self.dims['ag']), np.float32) # achieved goals o[:] = self.initial_o ag[:] = self.initial_ag # generate episodes obs, achieved_goals, acts, goals, successes = [], [], [], [], [] info_values = [np.empty((self.T, self.rollout_batch_size, self.dims['info_' + key]), np.float32) for key in self.info_keys] Qs = [] # addition for multi-tasks structures if self.structure == 'curious' or self.structure == 'task_experts': task_descrs = [] changes = [] # True when the achieved goal (outcome) has changed compared to the initial achieved goal for t in range(self.T): # when evaluating task_experts, the policy corresponding to the demanded task must be selected if self.structure=='task_experts' and self.eval: act_output = np.zeros([self.rollout_batch_size, self.dims['u']]) q_output = np.zeros([self.rollout_batch_size, 1]) for i in range(self.rollout_batch_size): tsk = np.argwhere(self.task_descr[i] == 1).squeeze() act_output[i, :], q_output[i, 0] = self.policy[tsk].get_actions( o[i].reshape([1, o[i].size]), ag[i].reshape([1, ag[i].size]), self.g[i].reshape([1, self.g[i].size]), task_descr=self.task_descr[i].reshape([1, self.task_descr[i].size]), compute_Q=self.compute_Q, noise_eps=self.noise_eps if not self.exploit else 0., random_eps=self.random_eps if not self.exploit else 0., use_target_net=self.use_target_net) policy_output = [act_output, q_output] else: policy_output = self.policy.get_actions( o, ag, self.g, task_descr = self.task_descr if self.structure == 'curious' else None, compute_Q=self.compute_Q, noise_eps=self.noise_eps if not self.exploit else 0., random_eps=self.random_eps if not self.exploit else 0., use_target_net=self.use_target_net) if self.compute_Q: u, Q = policy_output Qs.append(Q) else: u = policy_output if u.ndim == 1: # The non-batched case should still have a reasonable shape. u = u.reshape(1, -1) o_new = np.empty((self.rollout_batch_size, self.dims['o'])) ag_new = np.empty((self.rollout_batch_size, self.dims['ag'])) success = np.zeros(self.rollout_batch_size) r_competence = np.zeros(self.rollout_batch_size) # compute new states and observations for i in range(self.rollout_batch_size): try: # We fully ignore the reward here because it will have to be re-computed # for HER. 
if self.render: self.envs[i].render() curr_o_new, r_competence[i], _, info = self.envs[i].step(u[i]) if 'is_success' in info: success[i] = info['is_success'] o_new[i] = curr_o_new['observation'] ag_new[i] = curr_o_new['achieved_goal'] self.g[i] = curr_o_new['desired_goal'] # in case desired goal changes depending on observation for idx, key in enumerate(self.info_keys): info_values[idx][t, i] = info[key] except MujocoException as e: return self.generate_rollouts() if np.isnan(o_new).any(): self.logger.warning('NaN caught during rollout generation. Trying again...') self.reset_all_rollouts() return self.generate_rollouts() obs.append(o.copy()) achieved_goals.append(ag.copy()) successes.append(success.copy()) acts.append(u.copy()) goals.append(self.g.copy()) o[...] = o_new ag[...] = ag_new # addition for goal task selection if self.structure == 'curious' or self.structure == 'task_experts': task_descrs.append(self.task_descr.copy()) changes.append(np.abs(achieved_goals[0] - ag) > 1e-3) obs.append(o.copy()) achieved_goals.append(ag.copy()) episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals) # addition for multi-tasks structures if self.structure == 'curious' or self.structure == 'task_experts': episode['task_descr'] = task_descrs episode['change'] = changes self.initial_o[:] = o for key, value in zip(self.info_keys, info_values): episode['info_{}'.format(key)] = value # stats successful = np.array(successes)[-1, :] assert successful.shape == (self.rollout_batch_size,) success_rate = np.mean(successful) self.success_history.append(success_rate) self.reward_history.append(r_competence) if self.compute_Q: self.Q_history.append(np.mean(Qs)) self.n_episodes += self.rollout_batch_size * self.nb_cpu # addition for multi-tasks structures if self.structure == 'curious' or self.structure == 'task_experts': # only update competence if no noise has been used if self.exploit: tasks_for_competence = [self.envs[i].unwrapped.task for i in range(self.rollout_batch_size)] goals_for_competence = [self.envs[i].unwrapped.goal[self.tasks_g_id[tasks_for_competence[i]]] for i in range(self.rollout_batch_size)] full_goals_for_competence = [self.envs[i].unwrapped.goal for i in range(self.rollout_batch_size)] ag_for_competence = [achieved_goals[-1][i] for i in range(self.rollout_batch_size)] succ_list = successful.tolist() else: tasks_for_competence = [] goals_for_competence = [] full_goals_for_competence = [] ag_for_competence = [] succ_list = [] succ_list = MPI.COMM_WORLD.gather(succ_list, root=0) tasks_for_competence = MPI.COMM_WORLD.gather(tasks_for_competence, root=0) goals_for_competence = MPI.COMM_WORLD.gather(goals_for_competence, root=0) full_goals_for_competence = MPI.COMM_WORLD.gather(full_goals_for_competence, root=0) ag_for_competence = MPI.COMM_WORLD.gather(ag_for_competence, root=0) # update competence queues for each task in cpu rank 0 # compute next selection probabilities if self.rank == 0: tasks_for_competence = sum(tasks_for_competence, []) goals_for_competence = sum(goals_for_competence, []) succ_list = sum(succ_list, []) task_succ_list = [[] for _ in range(self.nb_tasks)] task_cp_list = [[] for _ in range(self.nb_tasks)] task_goal_list = [[] for _ in range(self.nb_tasks)] # update competence queues for succ, task in zip(succ_list, tasks_for_competence): task_succ_list[task].append(succ) for goal, task in zip(goals_for_competence, tasks_for_competence): task_goal_list[task].append(goal) for task in range(self.nb_tasks): self.competence_computers[task].update(task_succ_list[task]) # update 
competence and competence progress (learning progress) if self.goal_selection == 'active' and not self.eval: new_split, _ = self.goal_selectors[task].update(task_goal_list[task], task_succ_list[task]) if new_split: regions = self.goal_selectors[task].get_regions probas = self.goal_selectors[task].probas self.split_histories[task].append([regions, probas]) else: self.split_histories[task].append(None) self.C = np.array([self.get_C()]).squeeze() # get new updated competence measures # record all tasks self.task_history.extend(self.tasks.copy()) self.goal_history.extend(self.goals.copy()) # update task selection probabilities if active task selection if not self.eval: if self.task_selection == 'active_competence_progress' and self.structure != 'task_experts': # compute competence progress for each task self.CP = np.array([self.get_CP()]).squeeze() # softmax # exp_cp = np.exp(self.temperature*self.CP) # self.p = exp_cp / exp_cp.sum() # epsilon proportional epsilon = 0.4 if self.CP.sum() == 0: self.p = (1 / self.nb_tasks) * np.ones([self.nb_tasks]) else: self.p = epsilon * (1 / self.nb_tasks) * np.ones([self.nb_tasks]) + \ (1 - epsilon) * self.CP / self.CP.sum() if self.p.sum() > 1: self.p[np.argmax(self.p)] -= self.p.sum() - 1 elif self.p.sum() < 1: self.p[-1] = 1 - self.p[:-1].sum() elif self.structure == 'task_experts': self.p = np.zeros([self.nb_tasks]) self.p[self.unique_task] = 1 # broadcast the selection probability to all cpus and the competence if not self.eval: self.p = MPI.COMM_WORLD.bcast(self.p, root=0) self.CP = MPI.COMM_WORLD.bcast(self.CP, root=0) return convert_episode_to_batch_major(episode), self.CP, self.n_episodes
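# The epsilon-proportional task-selection rule used above (epsilon = 0.4, probabilities
# proportional to competence progress with a uniform floor) can be isolated into a small
# helper. This is a minimal sketch for illustration only -- the helper name is not part of
# the original code:
import numpy as np

def task_selection_probas(cp, nb_tasks, epsilon=0.4):
    """Epsilon-proportional selection probabilities from absolute competence progress.
    Falls back to a uniform distribution when no progress has been measured yet."""
    cp = np.asarray(cp, dtype=np.float64)
    if cp.sum() == 0:
        return np.ones(nb_tasks) / nb_tasks
    p = epsilon / nb_tasks + (1 - epsilon) * cp / cp.sum()
    p[np.argmax(p)] -= p.sum() - 1  # absorb floating-point drift so the probabilities sum to 1
    return p

# Example: task 2 shows the most learning progress, so it is sampled most often.
print(task_selection_probas([0.0, 0.1, 0.3, 0.05], nb_tasks=4))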
def generate_rollouts(self): """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current policy acting on it accordingly. """ self.reset_all_rollouts() # compute observations o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32) # observations ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32) # achieved goals o[:] = self.initial_o ag[:] = self.initial_ag # generate episodes obs, achieved_goals, acts, goals, successes = [], [], [], [], [] info_values = [ np.empty( (self.T, self.rollout_batch_size, self.dims['info_' + key]), np.float32) for key in self.info_keys ] Qs = [] ####################### hrl ############################# Rt_high_sum = np.zeros((self.rollout_batch_size, 1), np.float32) total_timestep = 1 high_goal_gt = np.empty((self.rollout_batch_size, self.dims['o']), np.float32) #high_goal_gt_tilda = np.empty((self.rollout_batch_size, self.dims['o']), np.float32) high_old_obj_st = np.empty((self.rollout_batch_size, self.dims['o']), np.float32) u_temp = np.empty((self.rollout_batch_size, self.dims['u']), np.float32) low_nn_at = np.zeros( (self.high_level_train_step * self.rollout_batch_size, self.dims['u']), np.float32).reshape(self.rollout_batch_size, self.high_level_train_step, self.dims['u']) low_nn_st = np.zeros( (self.high_level_train_step * self.rollout_batch_size, self.dims['o']), np.float32).reshape(self.rollout_batch_size, self.high_level_train_step, self.dims['o']) intrinsic_reward = np.zeros((self.rollout_batch_size, 1), np.float32) high_goal_gt[:] = self.initial_high_goal_gt #high_goal_gt_tilda[:] = self.initial_high_goal_gt_tilda ########################################################## for t in range(self.T): #print_point #print("cont t : ", t) #print("cont total_timestep : ", total_timestep) o_new = np.empty((self.rollout_batch_size, self.dims['o'])) ag_new = np.empty((self.rollout_batch_size, self.dims['g'])) success = np.zeros(self.rollout_batch_size) reward_new = np.zeros(self.rollout_batch_size) done_new = np.zeros(self.rollout_batch_size) # compute new states and observations for i in range(self.rollout_batch_size): #print_point #print(" i : ", i) policy_output = self.policy.get_low_actions( # o, ag, self.g, o[i], ag[i], high_goal_gt[i], compute_Q=self.compute_Q, noise_eps=self.noise_eps if not self.exploit else 0., random_eps=self.random_eps if not self.exploit else 0., use_target_net=self.use_target_net) if self.compute_Q: # u, Q = policy_output u = policy_output ## print_point #print(" self.compute_Q u : ", u) Q = self.policy.Get_Q_value(o[i], high_goal_gt[i], u) Qs.append(Q) else: u = policy_output ## print_point #print(" self.compute_Q else u : ", u) if u.ndim == 1: # The non-batched case should still have a reasonable shape. u = u.reshape(1, -1) try: # We fully ignore the reward here because it will have to be re-computed # for HER. 
# curr_o_new, _, _, info = self.envs[i].step(u[i]) ##################################### hrl ############################### #curr_o_new, reward, done, info = self.envs[i].step(u[i]) # jangikim #print("u.reshape(4,)", u.reshape(4,)) curr_o_new, reward, done, info = self.envs[i].step( u.reshape(4, )) # jangikim ######################################################################### if 'is_success' in info: success[i] = info['is_success'] o_new[i] = curr_o_new['observation'] ag_new[i] = curr_o_new['achieved_goal'] #jangikim reward_new[i] = reward ## print_point #print(" curr_o_new [0] : ".format(i), curr_o_new) #done_new[i] = done #if success[i] == 1 or done==1: if success[i] == 1: # done_new[i] = 1 print("done_new[{0}] : ".format(i), 1) #else: # done_new[i] = 0 #done_new[i] = 0 if t + 1 == self.T else float(done) done_new[i] = 0 if total_timestep == self.T else float( done) for idx, key in enumerate(self.info_keys): info_values[idx][t, i] = info[key] if self.render: self.envs[i].render() except MujocoException as e: return self.generate_rollouts() low_nn_at[i][t % self.high_level_train_step] = u low_nn_st[i][t % self.high_level_train_step] = o_new[i] Rt_high_sum[i] += reward_new[i] if total_timestep % self.high_level_train_step == 0: high_goal_gt[i] = self.policy.get_high_goal_gt( o[i], ag[i], self.g[i], compute_Q=self.compute_Q, noise_eps=self.noise_eps if not self.exploit else 0., random_eps=self.random_eps if not self.exploit else 0., use_target_net=self.use_target_net) ''' high_goal_gt_tilda[i] = self.policy.get_high_goal_gt_tilda(high_old_obj_st[i], ag[i], self.g[i], o_new[i], low_nn_st[i], low_nn_at[i]) ''' self.policy.update_meta_controller( self.g[i], Rt_high_sum[i] * 0.1, done_new[i], low_nn_st[i], low_nn_at[i], int((self.total_timestep + 1) / self.high_level_train_step), ag[i]) high_old_obj_st[i] = o_new[i] low_nn_at[i] = np.zeros( (self.high_level_train_step, self.dims['u']), np.float32) low_nn_st[i] = np.zeros( (self.high_level_train_step, self.dims['o']), np.float32) Rt_high_sum[i] = 0 else: high_goal_gt[i] = o[i] + high_goal_gt[i] - o_new[i] u_temp[i] = u #temp_test = (t % self.high_level_train_step) intrinsic_reward[i] = -LA.norm(o[i] + high_goal_gt[i] - o_new[i]) self.policy.update_controller(o[i], o_new[i], high_goal_gt[i], u, intrinsic_reward[i], done_new[i], total_timestep) total_timestep += 1 self.total_timestep += 1 if np.isnan(o_new).any(): self.logger.warn( 'NaN caught during rollout generation. Trying again...') self.reset_all_rollouts() return self.generate_rollouts() obs.append(o.copy()) achieved_goals.append(ag.copy()) successes.append(success.copy()) #acts.append(u.copy()) acts.append(u_temp.copy()) goals.append(self.g.copy()) o[...] = o_new ag[...] = ag_new obs.append(o.copy()) achieved_goals.append(ag.copy()) self.initial_o[:] = o ########################## hrl ######################### self.initial_high_goal_gt[:] = high_goal_gt #self.initial_high_goal_gt_tilda[:] = high_goal_gt_tilda ######################################################## episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals) for key, value in zip(self.info_keys, info_values): episode['info_{}'.format(key)] = value # stats successful = np.array(successes)[-1, :] assert successful.shape == (self.rollout_batch_size, ) success_rate = np.mean(successful) self.success_history.append(success_rate) if self.compute_Q: self.Q_history.append(np.mean(Qs)) self.n_episodes += self.rollout_batch_size return convert_episode_to_batch_major(episode)
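# The low-level loop above relabels the high-level goal as the state advances
# (high_goal_gt[i] = o[i] + high_goal_gt[i] - o_new[i]) and rewards the low-level policy
# with the negative distance to the commanded offset. A self-contained sketch of those two
# operations, with illustrative helper names:
import numpy as np

def relabel_low_level_goal(s_t, g_t, s_next):
    """Fixed goal transition: the goal is a desired offset in state space, so it is
    shifted by the state change at every step."""
    return s_t + g_t - s_next

def intrinsic_reward(s_t, g_t, s_next):
    """Negative distance between the commanded offset and what was actually achieved."""
    return -np.linalg.norm(s_t + g_t - s_next)

s_t = np.array([0.0, 0.0])
g_t = np.array([1.0, 0.0])      # "move one unit along x"
s_next = np.array([0.4, 0.1])   # what the low-level policy actually achieved
print(relabel_low_level_goal(s_t, g_t, s_next))  # remaining offset for the next step
print(intrinsic_reward(s_t, g_t, s_next))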
def generate_rollouts(self, render=False, test=False, exploit=False): """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current policy acting on it accordingly. """ self.reset_all_rollouts(test) # Annealing if self.expert != None: beta = self.beta() else: beta = 0 # compute observations o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32) # observations ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32) # achieved goals o[:] = self.initial_o ag[:] = self.initial_ag # generate episodes obs, achieved_goals, acts, goals, successes, returns, sigmas = [], [], [], [], [], [], [] info_values = [np.empty((self.T, self.rollout_batch_size, self.dims['info_' + key]), np.float32) for key in self.info_keys] for t in range(self.T): if np.random.rand() < beta: # The expert is in charge o_, g_, ag_ = self.trim(o, self.g, ag, self.expert.dimo, self.expert.dimg) policy_output = self.expert.get_actions(o_, ag_, g_, compute_raw=True) u, raw = policy_output else: policy_output = self.policy.get_actions( o, ag, self.g, exploit=exploit) u, raw, sigma = policy_output # We can't report sigma accurately when we are using the expert if self.expert != None: sigma = np.zeros((self.rollout_batch_size, self.dims['u'])) if u.ndim == 1: # The non-batched case should still have a reasonable shape. u = u.reshape(1, -1) raw = raw.reshape(1, -1) o_new = np.empty((self.rollout_batch_size, self.dims['o'])) ag_new = np.empty((self.rollout_batch_size, self.dims['g'])) success = np.zeros(self.rollout_batch_size) # -------------- r_new = np.zeros(self.rollout_batch_size) # -------------- # compute new states and observations for i in range(self.rollout_batch_size): # print(u[i]) try: # We don't ignore reward here # because we need to compute the return curr_o_new, r, _, info = self.envs[i].step(u[i]) if 'is_success' in info: success[i] = info['is_success'] o_new[i] = curr_o_new['observation'] ag_new[i] = curr_o_new['achieved_goal'] # -------------- r_new[i] = r # -------------- for idx, key in enumerate(self.info_keys): info_values[idx][t, i] = info[key] if render: self.envs[i].render() except MujocoException as e: self.logger.info(str(e)) self.logger.info('Exception thrown by Mujoco. Giving up on life...') assert(False) return self.generate_rollouts(render, test) if np.isnan(o_new).any(): self.logger.info('NaN caught during rollout generation. Trying again...') self.reset_all_rollouts(test) return self.generate_rollouts(render, test) obs.append(o.copy()) achieved_goals.append(ag.copy()) successes.append(success.copy()) acts.append(raw.copy()) goals.append(self.g.copy()) sigmas.append(sigma.copy()) # --------- returns.append(r_new.copy()) for t_ in range(t): r_new = r_new.copy() returns[t_] += self.gamma ** (t - t_) * r_new # --------- o[...] = o_new ag[...] = ag_new obs.append(o.copy()) achieved_goals.append(ag.copy()) self.initial_o[:] = o episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals, # -------- G=returns, sigma=sigmas) # -------- for key, value in zip(self.info_keys, info_values): episode['info_{}'.format(key)] = value # stats successful = np.array(successes)[-1, :] assert successful.shape == (self.rollout_batch_size,) success_rate = np.mean(successful) self.success_history.append(success_rate) self.n_episodes += self.rollout_batch_size return convert_episode_to_batch_major(episode)
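# This variant keeps the environment reward and accumulates discounted returns online:
# each new reward is propagated back to every earlier timestep with the appropriate power
# of gamma. A standalone sketch of that accumulation (the helper name is illustrative):
import numpy as np

def discounted_returns_online(rewards, gamma):
    """Build G_t = sum_k gamma**k * r_{t+k} incrementally, as in the rollout loop above."""
    returns = []
    for t, r in enumerate(rewards):
        returns.append(float(r))
        for t_ in range(t):
            returns[t_] += gamma ** (t - t_) * r
    return np.array(returns)

# Matches the closed-form backward recursion G_t = r_t + gamma * G_{t+1}.
print(discounted_returns_online([-1.0, -1.0, 0.0, 1.0], gamma=0.98))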
def generate_rollouts(self, ex_init=None, record=False, random=False, log_hit_time=False): """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current policy acting on it accordingly. """ if not self.active: return self.reset_all_rollouts(ex_init, record=record) # compute observations o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32) # observations ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32) # achieved goals o[:] = self.initial_o ag[:] = self.initial_ag qpos = np.empty((self.rollout_batch_size, self.dims['qpos']), np.float32) qvel = np.empty((self.rollout_batch_size, self.dims['qvel']), np.float32) qpos[:] = self.initial_qpos qvel[:] = self.initial_qvel num_envs = self.venv.num_envs random_action = self.policy._random_action(num_envs) reached_goal = [False] * num_envs hit_time = [None] * num_envs if random: self.exploration = 'random' else: self.exploration = 'eps_greedy' # 'go' # generate episodes obs, achieved_goals, acts, goals, successes = [], [], [], [], [] dones = [] info_values = [ np.empty((self.T - 1, self.rollout_batch_size, self.dims['info_' + key]), np.float32) for key in self.info_keys ] Qs, qposes, qvels, hit_times = [], [], [], [] for t in range(self.T): policy_output = self.policy.get_actions( o, ag, self.g, compute_Q=self.compute_Q, noise_eps=self.noise_eps if not self.exploit else 0., random_eps=self.random_eps if not self.exploit else 0., use_target_net=self.use_target_net, exploration=self.exploration, go=np.logical_not(reached_goal), random_action=random_action, ) if self.compute_Q: u, Q = policy_output Qs.append(Q) else: u = policy_output if u.ndim == 1: # The non-batched case should still have a reasonable shape. u = u.reshape(1, -1) # compute new states and observations obs_dict_new, _, done, info = self.venv.step(u) o_new = obs_dict_new['observation'] ag_new = obs_dict_new['achieved_goal'] qpos_new = obs_dict_new['qpos'] qvel_new = obs_dict_new['qvel'] success = np.array([i.get('is_success', 0.0) for i in info]) for e_idx, (suc, ht) in enumerate(zip(success, hit_time)): if suc and hit_time[e_idx] is None: hit_time[e_idx] = t reached_goal = [hit is not None for hit in hit_time] if any(done): # here we assume all environments are done is ~same number of steps, so we terminate rollouts whenever any of the envs returns done # trick with using vecenvs is not to add the obs from the environments that are "done", because those are already observations # after a reset break for i, info_dict in enumerate(info): for idx, key in enumerate(self.info_keys): info_values[idx][t, i] = info[i][key] if np.isnan(o_new).any(): self.logger.warn( 'NaN caught during rollout generation. Trying again...') self.reset_all_rollouts() return self.generate_rollouts() dones.append(done) obs.append(o.copy()) qposes.append(qpos.copy()) qvels.append(qvel.copy()) achieved_goals.append(ag.copy()) successes.append(success.copy()) acts.append(u.copy()) goals.append(self.g.copy()) o[...] = o_new ag[...] = ag_new qpos[...] = qpos_new qvel[...] 
= qvel_new obs.append(o.copy()) achieved_goals.append(ag.copy()) qposes.append(qpos.copy()) qvels.append(qvel.copy()) episode = dict( o=obs, u=acts, g=goals, ag=achieved_goals, qpos=qposes, qvel=qvels, # t=Ts ) if self.compute_Q: episode["Qs"] = Qs for key, value in zip(self.info_keys, info_values): episode['info_{}'.format(key)] = value # stats if self.exploration != 'random': if self.exploration in ['go_explore', 'go']: successful = np.asarray( [1 if hit is not None else 0 for hit in hit_time]) elif self.exploration in ['eps_greedy']: successful = np.array(successes)[-1, :] assert successful.shape == (self.rollout_batch_size, ) success_rate = np.mean(successful) self.success_history.append(success_rate) hit_times = np.asarray( [hit if hit is not None else 0 for hit in hit_time]) if log_hit_time: hit_time_mean = np.mean(hit_times) hit_time_std = np.std(hit_times) self.hit_time_mean_history.append(hit_time_mean) self.hit_time_std_history.append(hit_time_std) if self.compute_Q: self.Q_history.append(np.mean(Qs)) self.n_episodes += self.rollout_batch_size return convert_episode_to_batch_major(episode)
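# The go-explore style worker above records, per parallel env, the first timestep at which
# success is reported (hit_time) and uses it both to gate exploration and to log hit-time
# statistics. A minimal sketch of that bookkeeping:
import numpy as np

def update_hit_times(hit_time, success, t):
    """Record the first success timestep for each env; an env counts as having reached
    its goal once its hit time is set."""
    for i, suc in enumerate(success):
        if suc and hit_time[i] is None:
            hit_time[i] = t
    reached_goal = [h is not None for h in hit_time]
    return hit_time, reached_goal

hit_time = [None, None, None]
for t, success in enumerate([[0, 0, 0], [0, 1, 0], [1, 1, 0]]):
    hit_time, reached_goal = update_hit_times(hit_time, success, t)
print(hit_time, reached_goal)  # [2, 1, None] [True, True, False]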
def generate_rollouts(self, FLAGS): """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current policy acting on it accordingly. """ self.reset_all_rollouts() print("Finally in generate_rollouts!") # compute observations o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32) # observations ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32) # achieved goals o[:] = self.initial_o ag[:] = self.initial_ag # generate episodes obs, achieved_goals, acts, goals, successes = [], [], [], [], [] dones = [] info_values = [ np.empty((self.T - 1, self.rollout_batch_size, self.dims['info_' + key]), np.float32) for key in self.info_keys ] Qs = [] for t in range(self.T): policy_output = self.policy.get_actions( o, ag, self.g, compute_Q=self.compute_Q, noise_eps=self.noise_eps if not self.exploit else 0., random_eps=self.random_eps if not self.exploit else 0., use_target_net=self.use_target_net) # FLAGS=FLAGS) ## from run_HAC.py # Determine training mode. If not testing and not solely training, interleave training and testing to track progress # mix_train_test = False # if not FLAGS.test and not FLAGS.train_only: # mix_train_test = True ## from run_HAC.py, everything after this point was indented # Evaluate policy every TEST_FREQ batches if interleaving training and testing # if mix_train_test and t % TEST_FREQ == 0: # print("\n--- HAC TESTING ---") # # agent.FLAGS.test = True ## should the agent be passed in as an instance? # num_episodes = num_test_episodes # # Reset successful episode counter # successful_episodes = 0 if self.compute_Q: u, Q = policy_output Qs.append(Q) else: u = policy_output if u.ndim == 1: # The non-batched case should still have a reasonable shape. u = u.reshape(1, -1) o_new = np.empty((self.rollout_batch_size, self.dims['o'])) ag_new = np.empty((self.rollout_batch_size, self.dims['g'])) success = np.zeros(self.rollout_batch_size) # compute new states and observations obs_dict_new, _, done, info = self.venv.step(u) o_new = obs_dict_new['observation'] ag_new = obs_dict_new['achieved_goal'] success = np.array([i.get('is_success', 0.0) for i in info]) if any(done): # here we assume all environments are done in ~same number of steps, so we terminate rollouts whenever any of the envs returns done # trick with using vecenvs is not to add the obs from the environments that are "done", because those are already observations # after a reset break for i, info_dict in enumerate(info): for idx, key in enumerate(self.info_keys): info_values[idx][t, i] = info[i][key] if np.isnan(o_new).any(): # self.logger.warn('NaN caught during rollout generation. Trying again...') self.reset_all_rollouts() return self.generate_rollouts() dones.append(done) obs.append(o.copy()) achieved_goals.append(ag.copy()) successes.append(success.copy()) acts.append(u.copy()) goals.append(self.g.copy()) o[...] = o_new ag[...] = ag_new obs.append(o.copy()) achieved_goals.append(ag.copy()) episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals) for key, value in zip(self.info_keys, info_values): episode['info_{}'.format(key)] = value # stats successful = np.array(successes)[-1, :] assert successful.shape == (self.rollout_batch_size, ) success_rate = np.mean(successful) self.success_history.append(success_rate) if self.compute_Q: self.Q_history.append(np.mean(Qs)) self.n_episodes += self.rollout_batch_size return convert_episode_to_batch_major(episode)
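# Every worker in this file ends with convert_episode_to_batch_major. A hedged sketch of
# what that helper typically does in the baselines HER code (the exact implementation may
# differ): swap the (time, batch) axes so episodes are stored batch-major.
import numpy as np

def convert_episode_to_batch_major_sketch(episode):
    """(T, rollout_batch_size, dim) -> (rollout_batch_size, T, dim) for every entry."""
    episode_batch = {}
    for key, value in episode.items():
        value = np.asarray(value)
        episode_batch[key] = value.swapaxes(0, 1).copy()
    return episode_batch

episode = {'o': np.zeros((51, 2, 10)), 'u': np.zeros((50, 2, 4))}
batch = convert_episode_to_batch_major_sketch(episode)
print(batch['o'].shape, batch['u'].shape)  # (2, 51, 10) (2, 50, 4)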
def generate_rollouts(self): """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current policy acting on it accordingly. """ self.reset_all_rollouts() # compute observations. Initialize array of zeros observations = np.empty((self.rollout_batch_size, self.dims['o']), np.float32) # observations ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32) # achieved goals # Whole array assigned observations[:] = self.initial_o ag[:] = self.initial_ag # generate episodes obs, achieved_goals, acts, goals, successes, ep_reward_list = [], [], [], [], [], [] ep_reward = 0 env_step_counter = 0 dones = [] # print(self.info_keys) info_values = [ np.empty((self.T - 1, self.rollout_batch_size, self.dims['info_' + key]), np.float32) for key in self.info_keys ] Qs = [] # Do for rollout time horizon. This is equal to 50 because episode length is 50 # Not really much use if we go for a longer trajectory. If anything, shorten and test results # TODO: Shorten trajectory and check results for t in range(self.T): policy_output = self.policy.get_actions( observations, ag, self.g, compute_Q=self.compute_Q, noise_eps=self.noise_eps if not self.exploit else 0., random_eps=self.random_eps if not self.exploit else 0., use_target_net=self.use_target_net) if self.compute_Q: # Evaluator only action, Q = policy_output Qs.append(Q) else: action = policy_output if action.ndim == 1: # The non-batched case should still have a reasonable shape. action = action.reshape(1, -1) new_observation = np.empty( (self.rollout_batch_size, self.dims['o'])) new_achieved_goal = np.empty( (self.rollout_batch_size, self.dims['g'])) success = np.zeros(self.rollout_batch_size) # compute new states and observations obs_dict_new, reward, done, info = self.venv.step(action) # obs_dict_new {'achieved_goal': array([[1.3519502 , 0.73200333, 0.5274352 ]], dtype=float32), # 'desired_goal': array([[1.2729537 , 0.62809974, 0.51270455]], dtype=float32), # 'observation': array([[ 1.3519502e+00, 7.3200333e-01, 5.2743518e-01, 0.0000000e+00, # 0.0000000e+00, 1.7498910e-03, -3.6469495e-03, -1.8837147e-03, # -5.2045716e-06, 1.0831429e-04]], dtype=float32)} # reward [-1.] # info [{'is_success': 0.0}] # print(reward) # ep_reward_list.append(ep_reward) ep_reward += reward env_step_counter += 1 # print("env_step_counter, ep_reward ", env_step_counter, ep_reward) new_observation = obs_dict_new['observation'] new_achieved_goal = obs_dict_new['achieved_goal'] success = np.array([i.get('is_success', 0.0) for i in info]) if any(done): # here we assume all environments are done is ~same number of steps, so we terminate rollouts whenever # any of the envs returns done # trick with using vecenvs is not to add the obs from the environments that are "done", because those # are already observations after a reset break for i, info_dict in enumerate(info): for idx, key in enumerate(self.info_keys): info_values[idx][t, i] = info[i][key] if np.isnan(new_observation).any(): self.logger.warn( 'NaN caught during rollout generation. Trying again...') self.reset_all_rollouts() return self.generate_rollouts() dones.append(done) obs.append(observations.copy()) achieved_goals.append(ag.copy()) successes.append(success.copy()) acts.append(action.copy()) goals.append(self.g.copy()) observations[...] = new_observation ag[...] 
= new_achieved_goal self.episode_counter += 1 self.episode_reward = ep_reward[ -1] # Appending total ep_reward to episode_reward # print("episode_counter, episode_reward", self.episode_counter, self.episode_reward) obs.append(observations.copy()) achieved_goals.append(ag.copy()) episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals) for key, value in zip(self.info_keys, info_values): # print(key, value) episode['info_{}'.format(key)] = value # stats successful = np.array(successes)[-1, :] assert successful.shape == (self.rollout_batch_size, ) success_rate = np.mean(successful) # print("success_rate: ", success_rate) self.success_history.append(success_rate) # Used for tensorboard # print(self.success_history) self.reward_history.append(self.episode_reward) # Used for tensorboard # print(self.reward_history) if self.compute_Q: # Evaluator only self.Q_history.append(np.mean(Qs)) self.n_episodes += self.rollout_batch_size # print("Rollout Done") return convert_episode_to_batch_major(episode)
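# This variant additionally tracks the per-episode return and success rate for TensorBoard.
# A minimal, self-contained sketch of that bookkeeping (the class and argument names are
# illustrative, not from the original code):
from collections import deque

import numpy as np

class RolloutStats:
    """Keep bounded histories of the last-step success rate and the episode return."""
    def __init__(self, history_len=100):
        self.success_history = deque(maxlen=history_len)
        self.reward_history = deque(maxlen=history_len)

    def record(self, successes, episode_reward):
        # successes has shape (T, rollout_batch_size); only the final step counts
        self.success_history.append(float(np.mean(np.asarray(successes)[-1, :])))
        self.reward_history.append(float(episode_reward))

    def current_success_rate(self):
        return float(np.mean(self.success_history)) if self.success_history else 0.0

stats = RolloutStats()
stats.record(successes=np.array([[0., 1.], [1., 1.]]), episode_reward=-12.0)
print(stats.current_success_rate())  # 1.0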
def gen_rollouts_render(self): """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current policy acting on it accordingly. """ self.reset_all_rollouts() # compute observations o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32) # observations ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32) # achieved goals o[:] = self.initial_o ag[:] = self.initial_ag grip_poses = [] local_voxels = [] # generate episodes obs, achieved_goals, acts, goals, successes = [], [], [], [], [] dones = [] info_values = [np.empty((self.T - 1, self.rollout_batch_size, self.dims['info_' + key]), np.float32) for key in self.info_keys] Qs = [] for i in range(20): _, _, _, _ = self.venv.step(np.array([0,0,0,0])) for t in range(self.T): policy_output = self.policy.get_actions( o, ag, self.g, compute_Q=self.compute_Q, noise_eps=0, random_eps=0, use_target_net=self.use_target_net) # self.random_eps # if not self.exploit else 0., # if not self.exploit else 0., if self.compute_Q: u, Q = policy_output Qs.append(Q) else: u = policy_output if u.ndim == 1: # The non-batched case should still have a reasonable shape. u = u.reshape(1, -1) o_new = np.empty((self.rollout_batch_size, self.dims['o'])) ag_new = np.empty((self.rollout_batch_size, self.dims['g'])) success = np.zeros(self.rollout_batch_size) # compute new states and observations # TODO: adding noise to the action? sigma = 0.1 mu = 0 # import pdb; pdb.set_trace() disc_actions = np.array([-0.5, 0.5]) # hacked_u = np.array([]) if np.random.uniform(0, 1) < 0.9: u = np.random.uniform(-0.5, 0.5, 4)#(sigma * np.random.randn(4) + mu) else: u = np.squeeze(u) distance = np.abs(u.reshape([-1, 1]) - disc_actions) u_idx = np.squeeze(np.argmin(distance, -1)) u = np.array([disc_actions[u_idx[i]] for i in range(4)]) u[1] = 0.5 # import pdb; # pdb.set_trace() # import pdb; pdb.set_trace() obs_dict_new, _, done, info = self.venv.step(u) grip_pos = self.venv.loc2grid(obs_dict_new['observation'][:3]) # print(np.sum([local_voxels[x] for x in local_voxels.keys()])) # import pdb; pdb.set_trace() # self.venv.render() o_new = obs_dict_new['observation'] ag_new = obs_dict_new['achieved_goal'] # import pdb; pdb.set_trace() success = info['is_success'] #np.array([i.get('is_success', 0.0) for i in info]) # print(f'hahahahah success {success}') # print(f'I am done {done}') if done or success == 1: # here we assume all environments are done is ~same number of steps, so we terminate rollouts whenever any of the envs returns done # trick with using vecenvs is not to add the obs from the environments that are "done", because those are already observations # after a reset break # for i, info_dict in enumerate(info): # for idx, key in enumerate(self.info_keys): # info_values[idx][t, i] = info[i][key] if np.isnan(o_new).any(): self.logger.warn('NaN caught during rollout generation. Trying again...') self.reset_all_rollouts() return self.generate_rollouts() grip_poses.append(grip_pos) # harry added it local_voxels.append(info['local_voxel']) dones.append(done) obs.append(o.copy()) achieved_goals.append(ag.copy()) successes.append(success.copy()) acts.append(u.copy()) goals.append(self.g.copy()) o[...] = o_new ag[...] 
= ag_new obs.append(o.copy()) achieved_goals.append(ag.copy()) episode = dict(o=np.squeeze(obs), u=np.squeeze(acts), g=goals, ag=achieved_goals, grip_pos=np.array(grip_poses), local_voxels=local_voxels) return convert_episode_to_batch_major(episode)
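# The rendering variant above snaps most actions to a small discrete set {-0.5, 0.5}
# (nearest value per action dimension). That snapping step, isolated as a helper for
# illustration:
import numpy as np

def snap_to_discrete(u, disc_actions=(-0.5, 0.5)):
    """Replace each action component with its nearest value from a discrete set."""
    disc_actions = np.asarray(disc_actions)
    u = np.atleast_1d(np.squeeze(u))
    idx = np.argmin(np.abs(u.reshape(-1, 1) - disc_actions), axis=-1)
    return disc_actions[idx]

print(snap_to_discrete(np.array([0.1, -0.3, 0.49, -0.9])))  # [ 0.5 -0.5  0.5 -0.5]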
def generate_rollouts_ker(self): """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current policy acting on it accordingly. """ self.reset_all_rollouts() episodes = [] episodes_batch = [] # compute observations o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32) # observations ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32) # achieved goals o[:] = self.initial_o ag[:] = self.initial_ag # generate episodes obs, achieved_goals, acts, goals, successes = [], [], [], [], [] dones = [] info_values = [ np.empty((self.T - 1, self.rollout_batch_size, self.dims['info_' + key]), np.float32) for key in self.info_keys ] Qs = [] for t in range(self.T): policy_output = self.policy.get_actions( o, ag, self.g, compute_Q=self.compute_Q, noise_eps=self.noise_eps if not self.exploit else 0., random_eps=self.random_eps if not self.exploit else 0., use_target_net=self.use_target_net) if self.compute_Q: u, Q = policy_output Qs.append(Q) else: u = policy_output if u.ndim == 1: # The non-batched case should still have a reasonable shape. u = u.reshape(1, -1) # i.e. from (4,) to (1,4) o_new = np.empty((self.rollout_batch_size, self.dims['o'])) ag_new = np.empty((self.rollout_batch_size, self.dims['g'])) success = np.zeros(self.rollout_batch_size) # compute new states and observations, do not return the reward, and get it from her_sampler.py obs_dict_new, _, done, info = self.venv.step(u) o_new = obs_dict_new['observation'] ag_new = obs_dict_new['achieved_goal'] success = np.array([i.get('is_success', 0.0) for i in info]) # no need if any(done): # here we assume all environments are done is ~same number of steps, so we terminate rollouts whenever any of the envs returns done # trick with using vecenvs is not to add the obs from the environments that are "done", because those are already observations # after a reset break # no need for i, info_dict in enumerate(info): for idx, key in enumerate(self.info_keys): info_values[idx][t, i] = info[i][key] # no need if np.isnan(o_new).any(): self.logger.warn( 'NaN caught during rollout generation. Trying again...') self.reset_all_rollouts() return self.generate_rollouts() dones.append(done) obs.append(o.copy()) achieved_goals.append(ag.copy()) successes.append(success.copy()) acts.append(u.copy()) goals.append(self.g.copy()) # Set s = s' for new step o[...] = o_new ag[...] 
= ag_new # Append for last time step obs.append(o.copy()) achieved_goals.append(ag.copy()) # ----------------Kaleidoscope ER--------------------------- original_ka_episodes = self.ker.ker_process( obs, acts, goals, achieved_goals ) # KER augments original episodes by an amount of 2*n_ker # ----------------end--------------------------- # ----------------pack up as transition--------------------------- for (obs, acts, goals, achieved_goals) in original_ka_episodes: episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals) for key, value in zip(self.info_keys, info_values): episode['info_{}'.format(key)] = value episodes.append(episode) # ----------------end--------------------------- # stats successful = np.array(successes)[-1, :] assert successful.shape == (self.rollout_batch_size, ) success_rate = np.mean(successful) self.success_history.append(success_rate) if self.compute_Q: self.Q_history.append(np.mean(Qs)) mul_factor = 1 self.n_episodes += (mul_factor * self.rollout_batch_size) # ----------------format processing--------------------------- # return dict: ['o', 'u', 'g', 'ag', 'info_is_success'] for episode in episodes: episode_batch = convert_episode_to_batch_major( episode) # i.e. from 50,1,25 to 1,50,25 episodes_batch.append(episode_batch) # ----------------end--------------------------- return episodes_batch
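# ker.ker_process multiplies each recorded episode by applying invariant ("kaleidoscope")
# transformations. Its internals are not shown in this file, so the following is purely an
# illustrative example of the idea -- mirroring goals about an assumed symmetry plane
# y = 0.75 -- and NOT the actual ker_process implementation:
import numpy as np

def reflect_goals_about_y(goals, achieved_goals, y_axis=0.75, y_index=1):
    """Mirror the y-component of goals/achieved goals about the plane y = y_axis."""
    def mirror(arr):
        arr = np.array(arr, dtype=np.float64, copy=True)
        arr[..., y_index] = 2.0 * y_axis - arr[..., y_index]
        return arr
    return mirror(goals), mirror(achieved_goals)

goals = np.array([[[1.30, 0.80, 0.45]]])     # (T, batch, goal_dim)
achieved = np.array([[[1.30, 0.65, 0.42]]])
print(reflect_goals_about_y(goals, achieved))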
def generate_rollouts(self): """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current policy acting on it accordingly. """ self.reset_all_rollouts() # compute observations o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32) # observations ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32) # achieved goals o[:] = self.initial_o ag[:] = self.initial_ag # generate episodes obs, achieved_goals, acts, goals, successes, successes_pos = [], [], [], [], [], [] info_values = [ np.empty( (self.T, self.rollout_batch_size, self.dims['info_' + key]), np.float32) for key in self.info_keys ] Qs = [] for t in range(self.T): policy_output = self.policy.get_actions( o, ag, self.g, compute_Q=self.compute_Q, noise_eps=self.noise_eps if not self.exploit else 0., random_eps=self.random_eps if not self.exploit else 0., use_target_net=self.use_target_net) if self.compute_Q: u, Q = policy_output Qs.append(Q) else: u = policy_output if u.ndim == 1: # The non-batched case should still have a reasonable shape. u = u.reshape(1, -1) o_new = np.empty((self.rollout_batch_size, self.dims['o'])) ag_new = np.empty((self.rollout_batch_size, self.dims['g'])) success = np.zeros(self.rollout_batch_size) success_pos = np.zeros(self.rollout_batch_size) # compute new states and observations for i in range(self.rollout_batch_size): try: # We fully ignore the reward here because it will have to be re-computed # for HER. curr_o_new, _, _, info = self.envs[i].step(u[i]) if 'is_success' in info: success[i] = info['is_success'][1] success_pos[i] = info['is_success'][0] if 'done' in info: self.first_policy_done = info['done'] o_new[i] = curr_o_new['observation'] ag_new[i] = curr_o_new['achieved_goal'] for idx, key in enumerate(self.info_keys): info_values[idx][t, i] = info[key] if self.render: self.envs[i].render() except MujocoException as e: return self.generate_rollouts() if self.first_policy_done: break if np.isnan(o_new).any(): self.logger.warning( 'NaN caught during rollout generation. Trying again...') self.reset_all_rollouts() return self.generate_rollouts() obs.append(o.copy()) achieved_goals.append(ag.copy()) successes.append(success.copy()) successes_pos.append(success_pos.copy()) acts.append(u.copy()) goals.append(self.g.copy()) #Qs.append(np.linalg.norm(self.g.copy()-ag.copy(),axis=-1)) o[...] = o_new ag[...] 
= ag_new episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals) for key, value in zip(self.info_keys, info_values): episode['info_{}'.format(key)] = value # stats successful = np.array(successes)[-1, :] successful_pos = np.array(successes_pos)[-1, :] assert successful.shape == (self.rollout_batch_size, ) success_rate = np.mean(successful) success_rate_pos = np.mean(successful_pos) self.success_history.append(success_rate) self.success_pos_history.append(success_rate_pos) if self.compute_Q: self.Q_history.append(np.mean(Qs)) self.n_episodes += self.rollout_batch_size obs.append(o.copy()) achieved_goals.append(ag.copy()) self.initial_o[:] = o for t in range(self.T): self.g = np.array([[ 1, 1, 1, 1, 0.81399449, 0.08906187, 0.36651383, 0.80723628, 0.08749478, 0.34525658, 0.80821288, 0.08766061, 0.32291785, 0.81195864, 0.08844444, 0.29918275 ]], np.float32) policy_output = self.policy.get_actions( o, ag, self.g, compute_Q=self.compute_Q, noise_eps=self.noise_eps if not self.exploit else 0., random_eps=self.random_eps if not self.exploit else 0., use_target_net=self.use_target_net) if self.compute_Q: u, Q = policy_output Qs.append(Q) else: u = policy_output if u.ndim == 1: # The non-batched case should still have a reasonable shape. u = u.reshape(1, -1) o_new = np.empty((self.rollout_batch_size, self.dims['o'])) ag_new = np.empty((self.rollout_batch_size, self.dims['g'])) success = np.zeros(self.rollout_batch_size) success_pos = np.zeros(self.rollout_batch_size) # compute new states and observations for i in range(self.rollout_batch_size): try: # We fully ignore the reward here because it will have to be re-computed # for HER. curr_o_new, _, _, info = self.envs[i].step(u[i]) if 'is_success' in info: success[i] = info['is_success'][1] success_pos[i] = info['is_success'][0] if 'done' in info: self.first_policy_done = info['done'] o_new[i] = curr_o_new['observation'] ag_new[i] = curr_o_new['achieved_goal'] for idx, key in enumerate(self.info_keys): info_values[idx][t, i] = info[key] if self.render: self.envs[i].render() except MujocoException as e: return self.generate_rollouts() if self.first_policy_done: break if np.isnan(o_new).any(): self.logger.warning( 'NaN caught during rollout generation. Trying again...') self.reset_all_rollouts() return self.generate_rollouts() obs.append(o.copy()) achieved_goals.append(ag.copy()) successes.append(success.copy()) successes_pos.append(success_pos.copy()) acts.append(u.copy()) goals.append(self.g.copy()) o[...] = o_new ag[...] = ag_new return convert_episode_to_batch_major(episode)
def generate_rollouts(self): """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current policy acting on it accordingly. """ directory_plot = '../../../test_plot_png/' + datetime.datetime.now().strftime("%m%d_%H%M%S") + os.sep directory_env = '../../../test_env_png/' + datetime.datetime.now().strftime("%m%d_%H%M%S") + os.sep self.reset_all_rollouts() # compute observations o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32) # observations ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32) # achieved goals o[:] = self.initial_o ag[:] = self.initial_ag # generate episodes obs, achieved_goals, acts, goals, successes = [], [], [], [], [] info_values = [np.empty((self.T, self.rollout_batch_size, self.dims['info_' + key]), np.float32) for key in self.info_keys] Qs = [] x_bar = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25] x_lab = ["grip_pos-1","grip_pos-2","grip_pos-3","object_pos-1","object_pos-2","object_pos-3","object_rel_pos-1","object_rel_pos-2","object_rel_pos-3", "gripper_state-1", "gripper_state-2", "object_rot-1", "object_rot-2", "object_rot-3", "object_velp-1", "object_velp-2", "object_velp-3", "object_velr-1", "object_velr-2", "object_velr-3", "grip_velp-1", "grip_velp-2", "grip_velp-3", "gripper_vel-1", "gripper_vel-2"] observation_catcher = [] observation_catcher_1 = [] observation_catcher_2 = [] for t in range(self.T): policy_output = self.policy.get_actions( o, ag, self.g, compute_Q=self.compute_Q, noise_eps=self.noise_eps if not self.exploit else 0., random_eps=self.random_eps if not self.exploit else 0., use_target_net=self.use_target_net) if self.compute_Q: u, Q = policy_output Qs.append(Q) else: u = policy_output if u.ndim == 1: # The non-batched case should still have a reasonable shape. u = u.reshape(1, -1) o_new = np.empty((self.rollout_batch_size, self.dims['o'])) ag_new = np.empty((self.rollout_batch_size, self.dims['g'])) success = np.zeros(self.rollout_batch_size) # compute new states and observations for i in range(self.rollout_batch_size): try: # We fully ignore the reward here because it will have to be re-computed # for HER. 
curr_o_new, _, _, info = self.envs[i].step(u[i]) if 'is_success' in info: success[i] = info['is_success'] o_new[i] = curr_o_new['observation'] ag_new[i] = curr_o_new['achieved_goal'] for idx, key in enumerate(self.info_keys): info_values[idx][t, i] = info[key] if self.render: self.envs[i].render() elif self.rendder_and_save_png: #ndrw rgb_array = self.envs[i].render(mode='rgb_array') im = Image.fromarray(rgb_array) lov = im.crop((300,200,1000,650)) observation_catcher.append(o_new[i][0]) # use to append the requried set values from the observation vector (0-24) observation_catcher_1.append(o_new[i][1]) observation_catcher_2.append(o_new[i][2]) ax1.clear() ax1.plot(observation_catcher,color='xkcd:coral',linewidth=3,marker='o',markevery=[-1]) ax1.plot(observation_catcher_1,color='xkcd:green',linewidth=3,marker='o',markevery=[-1]) ax1.plot(observation_catcher_2,color='xkcd:goldenrod',linewidth=3,marker='o',markevery=[-1]) ax1.set_xlabel('Time-Step',fontsize=11) ax1.set_ylabel('Observation-Values',fontsize=11) ax1.legend(['gripper_vel-1','gripper_vel-2'], loc = 'upper right',facecolor='#74dd93',frameon=False,fontsize='x-small', ncol=3, bbox_to_anchor=(1,1.03)) ax1.set_facecolor('#74dd93') ax1.set_xlim(xmin=0) ax1.set_xlim(xmax=50) ax1.set_ylim(ymin=0.4) # default value --> should be checked according the y min in observed value - hard coded ax1.set_ylim(ymax=1.4) # default value --> should be checked according the y max in observed value - hard coded ax2.clear() barlist = ax2.bar(x_bar,color='xkcd:silver',width=0.4,height=0.025) barlist[0].set_color('xkcd:coral') barlist[1].set_color('xkcd:green') barlist[2].set_color('xkcd:goldenrod') ax2.set_yticklabels([]) ax2.set_xticks(x_bar) ax2.set_xticklabels(x_lab,rotation=90,fontsize=9) ax2.set_title('Observation Vector Of The Two-Finger Gripper(NN-Input)',fontsize=12) ax2.set_facecolor('#74dd93') ax2.set_frame_on(False) ax2.axes.get_yaxis().set_visible(False) if not os.path.exists(directory_plot): os.makedirs(directory_plot) if not os.path.exists(directory_env): os.makedirs(directory_env) plt.savefig(directory_plot + "pic_{0:05d}.png".format(t),facecolor=fig.get_facecolor(), edgecolor='none') lov.save(directory_env + "pic_{0:05d}.png".format(t)) except MujocoException as e: return self.generate_rollouts() if np.isnan(o_new).any(): self.logger.warning('NaN caught during rollout generation. Trying again...') self.reset_all_rollouts() return self.generate_rollouts() obs.append(o.copy()) achieved_goals.append(ag.copy()) successes.append(success.copy()) acts.append(u.copy()) goals.append(self.g.copy()) o[...] = o_new ag[...] = ag_new obs.append(o.copy()) achieved_goals.append(ag.copy()) self.initial_o[:] = o episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals) for key, value in zip(self.info_keys, info_values): episode['info_{}'.format(key)] = value # stats successful = np.array(successes)[-1, :] assert successful.shape == (self.rollout_batch_size,) success_rate = np.mean(successful) self.success_history.append(success_rate) if self.compute_Q: self.Q_history.append(np.mean(Qs)) self.n_episodes += self.rollout_batch_size #print(observation_catcher_2) return convert_episode_to_batch_major(episode)
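# The worker above saves one cropped environment frame and one matplotlib plot of selected
# observation components per timestep. A compact, headless sketch of the plotting part
# (the output directory and component choice are illustrative):
import os

import matplotlib
matplotlib.use('Agg')  # headless backend so this runs without a display
import matplotlib.pyplot as plt
import numpy as np

def save_observation_trace(trace, out_dir, t, labels=('grip_pos-1', 'grip_pos-2', 'grip_pos-3')):
    """Plot the history of a few observation components and save one PNG for timestep t."""
    os.makedirs(out_dir, exist_ok=True)
    fig, ax = plt.subplots()
    for series, label in zip(np.asarray(trace).T, labels):
        ax.plot(series, marker='o', markevery=[-1], label=label)
    ax.set_xlabel('Time-Step')
    ax.set_ylabel('Observation-Values')
    ax.legend(loc='upper right')
    fig.savefig(os.path.join(out_dir, 'pic_{0:05d}.png'.format(t)))
    plt.close(fig)

trace = 1.0 + np.cumsum(0.01 * np.random.randn(10, 3), axis=0)
save_observation_trace(trace, out_dir='test_plot_png', t=9)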
def generate_rollouts(self): """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current policy acting on it accordingly. """ self.reset_all_rollouts() # compute observations o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32) # observations ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32) # achieved goals o[:] = self.initial_o ag[:] = self.initial_ag # generate episodes obs, achieved_goals, acts, goals, successes = [], [], [], [], [] info_values = [np.empty((self.T, self.rollout_batch_size, self.dims['info_' + key]), np.float32) for key in self.info_keys] Qs = [] for t in range(self.T): policy_output = self.policy.get_actions( o, ag, self.g, compute_Q=self.compute_Q, noise_eps=self.noise_eps if not self.exploit else 0., random_eps=self.random_eps if not self.exploit else 0., use_target_net=self.use_target_net) if self.compute_Q: u, Q = policy_output Qs.append(Q) else: u = policy_output if u.ndim == 1: # The non-batched case should still have a reasonable shape. u = u.reshape(1, -1) o_new = np.empty((self.rollout_batch_size, self.dims['o'])) ag_new = np.empty((self.rollout_batch_size, self.dims['g'])) success = np.zeros(self.rollout_batch_size) # compute new states and observations for i in range(self.rollout_batch_size): try: # We fully ignore the reward here because it will have to be re-computed # for HER. curr_o_new, _, _, info = self.envs[i].step(u[i]) if 'is_success' in info: success[i] = info['is_success'] o_new[i] = curr_o_new['observation'] ag_new[i] = curr_o_new['achieved_goal'] for idx, key in enumerate(self.info_keys): info_values[idx][t, i] = info[key] if self.render: self.envs[i].render() except MujocoException as e: return self.generate_rollouts() if np.isnan(o_new).any(): self.logger.warn('NaN caught during rollout generation. Trying again...') self.reset_all_rollouts() return self.generate_rollouts() obs.append(o.copy()) achieved_goals.append(ag.copy()) successes.append(success.copy()) acts.append(u.copy()) goals.append(self.g.copy()) o[...] = o_new ag[...] = ag_new obs.append(o.copy()) achieved_goals.append(ag.copy()) self.initial_o[:] = o episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals) for key, value in zip(self.info_keys, info_values): episode['info_{}'.format(key)] = value # stats successful = np.array(successes)[-1, :] assert successful.shape == (self.rollout_batch_size,) success_rate = np.mean(successful) self.success_history.append(success_rate) if self.compute_Q: self.Q_history.append(np.mean(Qs)) self.n_episodes += self.rollout_batch_size return convert_episode_to_batch_major(episode)
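# This is the canonical baselines-style worker. For reference, the batch-major episode it
# returns has the following shapes (illustrative values: rollout_batch_size=2, T=50, and
# dims o=25, u=4, g=3):
import numpy as np

episode = {
    'o': np.zeros((2, 51, 25)),              # observations include the final state, hence T+1
    'u': np.zeros((2, 50, 4)),               # one action per timestep
    'g': np.zeros((2, 50, 3)),               # the desired goal, repeated each step
    'ag': np.zeros((2, 51, 3)),              # achieved goals, also T+1
    'info_is_success': np.zeros((2, 50, 1)),
}
for key, value in episode.items():
    print(key, value.shape)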
def generate_rollouts(self): """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current policy acting on it accordingly. """ self.reset_all_rollouts() # compute observations o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32) # observations ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32) # achieved goals o[:] = self.initial_o ag[:] = self.initial_ag # generate episodes obs, achieved_goals, acts, goals, successes = [], [], [], [], [] consistent_sgss = [] dones = [] info_values = [ np.empty((self.T - 1, self.rollout_batch_size, self.dims['info_' + key]), np.float32) for key in self.info_keys ] Qs = [] # print("new ep") # g_index = 0 g_indices = [0] * self.rollout_batch_size # self.policies.g_index = 0 for t in range(self.T): # policy_output = self.policy.get_actions( # # o, ag, self.gs[self.g_index], # # o, ag, self.g, # o, ag, self.gs[2], # compute_Q=self.compute_Q, # noise_eps=self.noise_eps if not self.exploit else 0., # random_eps=self.random_eps if not self.exploit else 0., # use_target_net=self.use_target_net) # print(o) # print(o.shape) # print(ag.shape) # print(self.gs) # print(self.gs.shape) # print(self.gs[0][g_index]) # print(self.g) # print(self.gs[0][g_index].shape) #num_env = 2: (same with num_cpu = n) #2,25 | 1,25 #2,3 | 1,3 #2,3,3 | 1,3,3 #[1.47,.62,.45] | [1.46,.62,.45] #3, | 3, # [g[i][g_inds[i]] for i in range(len(g_inds))] # sgs = np.array([a[b] for a,b in zip(self.gs,g_indices)]) self.g = np.array([a[b] for a, b in zip(self.gs, g_indices)]) # print(self.gs) # print(g_indices) # print(self.g) policy_output = self.policies.get_actions( # o, ag, self.gs, g_index, # o, ag, self.gs[0][g_index], g_index, # o, ag, sgs, g_indices, o, ag, self.g, g_indices, compute_Q=self.compute_Q, noise_eps=self.noise_eps if not self.exploit else 0., random_eps=self.random_eps if not self.exploit else 0., use_target_net=self.use_target_net) if self.compute_Q: u, Q = policy_output Qs.append(Q) else: u = policy_output if u.ndim == 1: # The non-batched case should still have a reasonable shape. 
u = u.reshape(1, -1) o_new = np.empty((self.rollout_batch_size, self.dims['o'])) ag_new = np.empty((self.rollout_batch_size, self.dims['g'])) success = np.zeros(self.rollout_batch_size) # compute new states and observations obs_dict_new, rewards, done, info = self.venv.step(u) #TODO: All definitely only works for one env, extend for any num # g_index_new = obs_dict_new['goal_index'] #make sure this doesn't change outside of this # consistent_sgs = info[0]['consistent_subgoals'] consistent_sgs = np.array( [i.get('consistent_subgoals', 0.0) for i in info]) o_new = obs_dict_new['observation'] ag_new = obs_dict_new['achieved_goal'] success = np.array([i.get('is_success', 0.0) for i in info]) # self.g_index = g_index_new #update goal/goal_index if we achieve a subgoal for i in np.where(rewards != -1)[0]: # print(i) g_indices[i] = min(g_indices[i] + 1, self.policies.num_goals - 1) # print("?") # self.g = [self.gs[:,g_indices]] # if reward != -1 and g_index < len(self.gs[0])-1:#[0])-1: # g_index += 1 # #would have to be of len(numenvs) # self.g = [self.gs[0][g_index]] # #identify transition as candidate for subgoal experience replay # for i in range(len(consistent_sgs)): # if consistent_sgs[i] == 1: # self.subgoal_timesteps[i].append(t) # if g_index_new != self.g_index: # self.subgoal_timesteps.append(t) # self.g_index = g_index_new if any(done): # here we assume all environments are done is ~same number of steps, so we terminate rollouts whenever any of the envs returns done # trick with using vecenvs is not to add the obs from the environments that are "done", because those are already observations # after a reset break for i, info_dict in enumerate(info): for idx, key in enumerate(self.info_keys): info_values[idx][t, i] = info[i][key] if np.isnan(o_new).any(): self.logger.warn( 'NaN caught during rollout generation. Trying again...') self.reset_all_rollouts() return self.generate_rollouts() consistent_sgss.append(consistent_sgs.copy()) dones.append(done) obs.append(o.copy()) achieved_goals.append(ag.copy()) successes.append(success.copy()) acts.append(u.copy()) goals.append(self.g.copy()) # goals.append(self.gs[self.g_index].copy()) o[...] = o_new ag[...] = ag_new #in case subgoal was achieved # self.g = obs_dict_new['desired_goal'].copy() # if reward != -1 and self.g_index < len(self.goals): # self.g_index += 1 obs.append(o.copy()) achieved_goals.append(ag.copy()) episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals, sgt=consistent_sgss) for key, value in zip(self.info_keys, info_values): episode['info_{}'.format(key)] = value # stats successful = np.array(successes)[-1, :] assert successful.shape == (self.rollout_batch_size, ) success_rate = np.mean(successful) self.success_history.append(success_rate) if self.compute_Q: self.Q_history.append(np.mean(Qs)) self.n_episodes += self.rollout_batch_size return convert_episode_to_batch_major(episode)
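# The final variant walks through an ordered list of subgoals: whenever an env's sparse
# reward is not -1, its goal index advances (capped at the last subgoal) and the active
# goal is re-assembled from the per-env indices. A standalone sketch of that logic
# (helper names are illustrative):
import numpy as np

def advance_subgoal_indices(g_indices, rewards, num_goals):
    """Advance each env's subgoal index when its reward signals success (reward != -1)."""
    g_indices = list(g_indices)
    for i in np.where(np.asarray(rewards) != -1)[0]:
        g_indices[i] = min(g_indices[i] + 1, num_goals - 1)
    return g_indices

def current_goals(gs, g_indices):
    """Select the active subgoal for every parallel env."""
    return np.array([g[idx] for g, idx in zip(gs, g_indices)])

gs = np.arange(2 * 3 * 3).reshape(2, 3, 3)   # 2 envs, 3 subgoals of dimension 3
g_indices = advance_subgoal_indices([0, 0], rewards=[-1.0, 0.0], num_goals=3)
print(g_indices)                             # [0, 1]
print(current_goals(gs, g_indices))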