class Agent:
    def __init__(self, n_states, n_actions, n_goals, action_bounds, capacity, env,
                 k_future, batch_size, action_size=1, tau=0.05, actor_lr=1e-3,
                 critic_lr=1e-3, gamma=0.98):
        self.device = device("cpu")
        self.n_states = n_states
        self.n_actions = n_actions
        self.n_goals = n_goals
        self.k_future = k_future
        self.action_bounds = action_bounds
        self.action_size = action_size
        self.env = env

        self.actor = Actor(self.n_states, n_actions=self.n_actions,
                           n_goals=self.n_goals).to(self.device)
        self.critic = Critic(self.n_states, action_size=self.action_size,
                             n_goals=self.n_goals).to(self.device)
        self.sync_networks(self.actor)
        self.sync_networks(self.critic)
        self.actor_target = Actor(self.n_states, n_actions=self.n_actions,
                                  n_goals=self.n_goals).to(self.device)
        self.critic_target = Critic(self.n_states, action_size=self.action_size,
                                    n_goals=self.n_goals).to(self.device)
        self.init_target_networks()
        self.tau = tau
        self.gamma = gamma

        self.capacity = capacity
        self.memory = Memory(self.capacity, self.k_future, self.env)

        self.batch_size = batch_size
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.actor_optim = Adam(self.actor.parameters(), self.actor_lr)
        self.critic_optim = Adam(self.critic.parameters(), self.critic_lr)

        self.state_normalizer = Normalizer(self.n_states[0], default_clip_range=5)
        self.goal_normalizer = Normalizer(self.n_goals, default_clip_range=5)

    def choose_action(self, state, goal, train_mode=True):
        # Normalize and concatenate the state and goal, then query the actor.
        # In train mode, Gaussian noise is added and, with probability 0.3,
        # the action is replaced by a uniformly random one (exploration).
        state = self.state_normalizer.normalize(state)
        goal = self.goal_normalizer.normalize(goal)
        state = np.expand_dims(state, axis=0)
        goal = np.expand_dims(goal, axis=0)

        with torch.no_grad():
            x = np.concatenate([state, goal], axis=1)
            x = from_numpy(x).float().to(self.device)
            action = self.actor(x)[0].cpu().data.numpy()

        if train_mode:
            action += 0.2 * np.random.randn(self.n_actions)
            action = np.clip(action, self.action_bounds[0], self.action_bounds[1])
            random_actions = np.random.uniform(low=self.action_bounds[0],
                                               high=self.action_bounds[1],
                                               size=self.n_actions)
            action += np.random.binomial(1, 0.3, 1)[0] * (random_actions - action)

        return action

    def store(self, mini_batch):
        for batch in mini_batch:
            self.memory.add(batch)
        self._update_normalizer(mini_batch)

    def init_target_networks(self):
        self.hard_update_networks(self.actor, self.actor_target)
        self.hard_update_networks(self.critic, self.critic_target)

    @staticmethod
    def hard_update_networks(local_model, target_model):
        target_model.load_state_dict(local_model.state_dict())

    @staticmethod
    def soft_update_networks(local_model, target_model, tau=0.05):
        for t_params, e_params in zip(target_model.parameters(), local_model.parameters()):
            t_params.data.copy_(tau * e_params.data + (1 - tau) * t_params.data)

    def train(self):
        states, actions, rewards, next_states, goals = self.memory.sample(self.batch_size)

        states = self.state_normalizer.normalize(states)
        next_states = self.state_normalizer.normalize(next_states)
        goals = self.goal_normalizer.normalize(goals)
        inputs = np.concatenate([states, goals], axis=1)
        next_inputs = np.concatenate([next_states, goals], axis=1)

        inputs = torch.Tensor(inputs).to(self.device)
        rewards = torch.Tensor(rewards).to(self.device)
        next_inputs = torch.Tensor(next_inputs).to(self.device)
        actions = torch.Tensor(actions).to(self.device)

        with torch.no_grad():
            # Q-value of the target policy's action in the next state
            target_q = self.critic_target(next_inputs, self.actor_target(next_inputs))
            # Bellman backup: bootstrapped target for the sampled (state, action) pairs
            target_returns = rewards + self.gamma * target_q.detach()
            target_returns = torch.clamp(target_returns, -1 / (1 - self.gamma), 0)

        # Q-value predicted by the online critic for the same (state, action) pairs
        q_eval = self.critic(inputs, actions)
        critic_loss = (target_returns - q_eval).pow(2).mean()

        a = self.actor(inputs)
        actor_loss = -self.critic(inputs, a).mean()
        actor_loss += a.pow(2).mean()  # action L2 regularisation

        self.actor_optim.zero_grad()
        actor_loss.backward()
        self.sync_grads(self.actor)
        self.actor_optim.step()

        self.critic_optim.zero_grad()
        critic_loss.backward()
        self.sync_grads(self.critic)
        self.critic_optim.step()

        return actor_loss.item(), critic_loss.item()

    def save_weights(self):
        torch.save({"actor_state_dict": self.actor.state_dict(),
                    "state_normalizer_mean": self.state_normalizer.mean,
                    "state_normalizer_std": self.state_normalizer.std,
                    "goal_normalizer_mean": self.goal_normalizer.mean,
                    "goal_normalizer_std": self.goal_normalizer.std},
                   "NBM_FetchPickAndPlace_v2.pth")

    def load_weights(self):
        checkpoint = torch.load("NBM_FetchPickAndPlace_v2.pth")
        self.actor.load_state_dict(checkpoint["actor_state_dict"])
        self.state_normalizer.mean = checkpoint["state_normalizer_mean"]
        self.state_normalizer.std = checkpoint["state_normalizer_std"]
        self.goal_normalizer.mean = checkpoint["goal_normalizer_mean"]
        self.goal_normalizer.std = checkpoint["goal_normalizer_std"]

    def set_to_eval_mode(self):
        self.actor.eval()
        # self.critic.eval()

    def update_networks(self):
        self.soft_update_networks(self.actor, self.actor_target, self.tau)
        self.soft_update_networks(self.critic, self.critic_target, self.tau)

    def _update_normalizer(self, mini_batch):
        states, goals = self.memory.sample_for_normalization(mini_batch)
        self.state_normalizer.update(states)
        self.goal_normalizer.update(goals)
        self.state_normalizer.recompute_stats()
        self.goal_normalizer.recompute_stats()

    @staticmethod
    def sync_networks(network):
        comm = MPI.COMM_WORLD
        flat_params = _get_flat_params_or_grads(network, mode='params')
        comm.Bcast(flat_params, root=0)
        _set_flat_params_or_grads(network, flat_params, mode='params')

    @staticmethod
    def sync_grads(network):
        comm = MPI.COMM_WORLD
        flat_grads = _get_flat_params_or_grads(network, mode='grads')
        global_grads = np.zeros_like(flat_grads)
        comm.Allreduce(flat_grads, global_grads, op=MPI.SUM)
        _set_flat_params_or_grads(network, global_grads, mode='grads')
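# Note on the clamp above: with the sparse Fetch-style reward r_t in {-1, 0},
# the discounted return is bounded by [-1 / (1 - gamma), 0], so clipping the
# bootstrapped target to that interval only removes critic over-estimation and
# never truncates a reachable return. The helper below is purely illustrative
# (not used by Agent) and just evaluates the worst-case bound numerically.
def _worst_case_return(gamma=0.98, horizon=10_000):
    """Most negative discounted return possible with r_t = -1 at every step;
    approaches -1 / (1 - gamma), i.e. about -50 for gamma = 0.98."""
    return sum(-1.0 * gamma ** t for t in range(horizon))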
class Planner(object):
    @store_args
    def __init__(self, inp_dim, hid_size, seq_len, out_dim, buffer_size,
                 batch_size=64, optim_stepsize=1e-3, sample_func=None,
                 norm_eps=1e-2, norm_clip=5, scope='planner', layerNorm=False,
                 **kwargs):
        '''
        Implementation of an LSTM planner that produces a given number of
        subgoals between src and dest.

        Args:
            inp_dim  : input dimension for the LSTM
            hid_size : cell state size
            seq_len  : max_timesteps
            out_dim  : dimension of the LSTM output
        '''
        # self.main = lstm(hid_size, layerNorm)
        self.adamepsilon = 1e-6
        # TRAIN for training, INFER for prediction, EVAL for evaluation
        self.mode = tf.contrib.learn.ModeKeys.TRAIN
        self.infer_outputs = None

        with tf.variable_scope(self.scope):
            self._create_network()

        # plus 2: [0] is 'src', [1] is 'dest', [2:] are the 'labels'
        buffer_shape = [seq_len + 2, out_dim]
        if self.sample_func is None:
            from sampler import make_sample_plans
            self.sample_func = make_sample_plans()
        self.buffer = PlanReplayBuffer(buffer_shape, buffer_size, self.sample_func)

    def _create_network(self):
        self.sess = U.get_session()

        self.inp_src = tf.placeholder(shape=[None, 1, self.inp_dim],
                                      dtype=tf.float32, name='input_src')
        self.inp_dest = tf.placeholder(shape=[None, 1, self.out_dim],
                                       dtype=tf.float32, name='input_dest')
        self.labels = tf.placeholder(shape=[None, self.seq_len, self.out_dim],
                                     dtype=tf.float32, name='label')
        self.src_seq_len = tf.placeholder(tf.int32, (None,), name='source_sequence_length')
        self.tar_seq_len = tf.placeholder(tf.int32, (None,), name='target_sequence_length')

        # running averages
        # with tf.variable_scope('goal_stats_src'):
        #     self.goal_stats_src = Normalizer(self.inp_dim, self.norm_eps, self.norm_clip, sess=self.sess)
        with tf.variable_scope('goal_stats_dest'):
            self.goal_stats_dest = Normalizer(self.out_dim, self.norm_eps,
                                              self.norm_clip, sess=self.sess, PLN=True)

        # normalize inp_src, inp_dest and the goal labels
        inp_src = self.goal_stats_dest.normalize(self.inp_src)
        inp_dest = self.goal_stats_dest.normalize(self.inp_dest)
        goal_labels = self.goal_stats_dest.normalize(self.labels)

        with tf.variable_scope('goal_gen'):
            encoder_cell = tf.nn.rnn_cell.LSTMCell(self.hid_size)
            encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
                encoder_cell, inp_src, sequence_length=self.src_seq_len, dtype=tf.float32)

            decoder_cell = tf.nn.rnn_cell.LSTMCell(self.hid_size)
            project_layer = tf.layers.Dense(self.out_dim)

            with tf.variable_scope("decode"):
                train_inp = tf.concat([inp_dest, goal_labels[:, :-1, :]], axis=-2)
                train_helper = tf.contrib.seq2seq.TrainingHelper(
                    train_inp, sequence_length=self.tar_seq_len)
                train_decoder = tf.contrib.seq2seq.BasicDecoder(
                    decoder_cell, train_helper, encoder_state, output_layer=project_layer)
                train_outputs, _, final_seq_len = tf.contrib.seq2seq.dynamic_decode(
                    train_decoder, maximum_iterations=self.seq_len)
                self.train_outputs = train_outputs.rnn_output

            with tf.variable_scope("decode", reuse=True):
                infer_helper = ContinousInferHelper(inp_dest[:, 0, :], self.tar_seq_len)
                infer_decoder = tf.contrib.seq2seq.BasicDecoder(
                    decoder_cell, infer_helper, encoder_state, output_layer=project_layer)
                infer_outputs, _, final_seq_len = tf.contrib.seq2seq.dynamic_decode(
                    infer_decoder, maximum_iterations=self.seq_len)
                self.infer_outputs = self.goal_stats_dest.denormalize(infer_outputs.rnn_output)

        log_sigma = tf.get_variable(name="logstd", shape=[1, self.out_dim],
                                    initializer=U.normc_initializer(0.1))
        goals = train_outputs.rnn_output
        # negative log-likelihood of the subgoal labels under a diagonal Gaussian
        loss = 0.5 * tf.reduce_sum(tf.square((goal_labels - goals) / tf.exp(log_sigma)), axis=-1) \
            + 0.5 * np.log(2 * np.pi) * tf.to_float(tf.shape(self.labels)[-1]) \
            + tf.reduce_sum(log_sigma, axis=-1)
        self.loss = tf.reduce_mean(loss)
        # denormalized training outputs, kept only to inspect training correctness
        self.tr_outputs = self.goal_stats_dest.denormalize(self.train_outputs)

        var_list = self._vars('')
        self.grads = U.flatgrad(self.loss, var_list)
        self.adam = MpiAdam(var_list, epsilon=self.adamepsilon)

        tf.variables_initializer(self._global_vars('')).run()
        self.adam.sync()

    def train(self, use_buffer=False, justEval=False, **kwargs):
        self.mode = tf.contrib.learn.ModeKeys.TRAIN
        if not use_buffer:
            src = np.reshape(kwargs['src'], [-1, 1, self.inp_dim])
            dest = np.reshape(kwargs['dest'], [-1, 1, self.out_dim])
            lbl = kwargs['lbl']
        else:
            episode_batch = self.buffer.sample(self.batch_size)
            src = np.reshape(episode_batch[:, 0, :], [-1, 1, self.inp_dim])
            lbl = episode_batch[:, 2:, :]
            dest = np.reshape(episode_batch[:, 1, :], [-1, 1, self.out_dim])

        src_seq_len = [1] * src.shape[0]
        tar_seq_len = [self.seq_len] * dest.shape[0]

        # compute the gradients
        loss, g, tr_sub_goals, te_sub_goals = self.sess.run(
            [self.loss, self.grads, self.tr_outputs, self.infer_outputs],
            feed_dict={
                self.inp_src: src,
                self.inp_dest: dest,
                self.labels: lbl,
                self.src_seq_len: src_seq_len,
                self.tar_seq_len: tar_seq_len
            })
        if not justEval:
            self.adam.update(g, stepsize=self.optim_stepsize)
        return loss, tr_sub_goals[-1], te_sub_goals[-1]

    def plan(self, src, dest):
        src = np.reshape(src, [-1, 1, self.inp_dim])
        dest = np.reshape(dest, [-1, 1, self.out_dim])
        src_seq_len = [1] * src.shape[0]
        tar_seq_len = [self.seq_len] * dest.shape[0]

        plan_goals = self.sess.run(self.infer_outputs,
                                   feed_dict={
                                       self.inp_src: src,
                                       self.inp_dest: dest,
                                       self.src_seq_len: src_seq_len,
                                       self.tar_seq_len: tar_seq_len
                                   })
        assert plan_goals.shape[0] == src.shape[0] and plan_goals.shape[1] == self.seq_len
        plan_goals = np.flip(plan_goals, axis=-2)
        plan_goals = np.concatenate([plan_goals, dest], axis=-2)  # append the ultimate goal
        return plan_goals

    def store_episode(self, episode_batch, update_stats=True):
        """
        episode_batch : [batch_size * (subgoal_num + 1) * subgoal_dim]
        """
        isNull = episode_batch.shape[0] < 1
        if not isNull:
            self.buffer.store_episode(episode_batch)
            # logger.info("buffer store_episode done. updating statistics.")
        if update_stats:
            subgoals = episode_batch[:, 1:, :]
            self.goal_stats_dest.update(subgoals, isNull=isNull)
            # logger.info("ready to recompute_stats")
            # print(subgoals)
            self.goal_stats_dest.recompute_stats(inc=episode_batch.shape[0])

    def update_normalizer_stats(self, batch):
        # self.goal_stats_src.update(batch['src'])
        self.goal_stats_dest.update(batch['dest'])
        # self.goal_stats_src.recompute_stats()
        self.goal_stats_dest.recompute_stats()

    def _vars(self, scope):
        res = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                scope=self.scope + '/' + scope)
        assert len(res) > 0
        return res

    def _global_vars(self, scope):
        res = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                scope=self.scope + '/' + scope)
        return res

    def save(self, save_path):
        assert self.infer_outputs is not None
        var_list = self._global_vars('')
        U.save_variables(save_path, variables=var_list, sess=self.sess)

    def load(self, load_path):
        if self.infer_outputs is None:
            self._create_network()
        var_list = self._global_vars('')
        U.load_variables(load_path, variables=var_list)

    def logs(self, prefix=''):
        logs = []
        logs += [('subgoals/buff_size', self.buffer.get_current_episode_size())]
        logs += [('goals/mean', np.mean(self.sess.run([self.goal_stats_dest.mean])))]
        logs += [('goals/std', np.mean(self.sess.run([self.goal_stats_dest.std])))]

        if prefix != '':
            prefix = prefix.strip('/')
            return [(prefix + '/' + key, val) for key, val in logs]
        else:
            return logs
class DdpgHer(object):

    _default_config = {
        'n_epochs': 50,
        'n_cycles': 50,
        'n_batches': 40,
        'checkpoint_freq': 5,
        'seed': 123,
        'num_workers': 1,
        'replay_strategy': 'future',
        'clip_return': 50.,
        'noise_eps': 0.2,
        'random_eps': 0.3,
        'buffer_size': int(1e6),
        'replay_k': 4,
        'clip_obs': 200.,
        'batch_size': 256,
        'hidden_units': 256,
        'gamma': 0.98,
        'action_l2': 1.,
        'lr_actor': 0.001,
        'lr_critic': 0.001,
        'polyak': 0.95,
        'n_test_rollouts': 10,
        'clip_range': 5.,
        'demo_length': 20,
        'local_dir': None,
        'cuda': None,
        'max_gpus': None,
        'rollouts_per_worker': 2,
        'goal_space_bins': None,
        'archer_params': None,
        'q_filter': False,
        'prm_loss_weight': 0.001,
        'aux_loss_weight': 0.0078,
        'demo_batch_size': None,
        'demo_file': None,
        'num_demo': 100,
    }

    def __init__(self, env, config, reporter=None):
        super().__init__()
        self.env = env
        self.config = {**DdpgHer._default_config, **config}
        self.seed(self.config['seed'])

        a_space, obs_space = self.env.action_space, self.env.observation_space
        obs_size = obs_space.spaces['observation'].shape[0]
        goal_size = obs_space.spaces['desired_goal'].shape[0]
        self.env_params = get_env_params(self.env)
        self.reporter = reporter

        if self.config['cuda'] is None:
            self.config['cuda'] = torch.cuda.is_available()

        if self.config['cuda']:
            n_gpus = torch.cuda.device_count()
            assert n_gpus > 0
            max_gpus = self.config['max_gpus']
            if max_gpus is None:
                max_gpus = n_gpus
            n_gpus = min(n_gpus, max_gpus)
            n_workers = MPI.COMM_WORLD.size
            rank = MPI.COMM_WORLD.rank
            w_per_gpu = int(np.ceil(n_workers / n_gpus))
            gpu_i = rank // w_per_gpu
            print(f'Worker with rank {rank} assigned GPU {gpu_i}.')
            torch.cuda.set_device(gpu_i)

        self.bc_loss = self.config.get('demo_file') is not None
        self.q_filter = self.config['q_filter']

        # create the networks
        self.actor_network = ActorNetwork(action_space=a_space,
                                          observation_space=obs_space,
                                          hidden_units=self.config['hidden_units'])
        self.critic_network = CriticNetwork(action_space=a_space,
                                            observation_space=obs_space,
                                            hidden_units=self.config['hidden_units'])

        # sync the networks across the MPI workers
        sync_networks(self.actor_network)
        sync_networks(self.critic_network)

        # build the target networks
        self.actor_target_network = ActorNetwork(action_space=a_space,
                                                 observation_space=obs_space,
                                                 hidden_units=self.config['hidden_units'])
        self.critic_target_network = CriticNetwork(action_space=a_space,
                                                   observation_space=obs_space,
                                                   hidden_units=self.config['hidden_units'])

        # load the weights into the target networks
        self.actor_target_network.load_state_dict(self.actor_network.state_dict())
        self.critic_target_network.load_state_dict(self.critic_network.state_dict())

        # if using the gpu
        if self.config['cuda']:
            self.actor_network.cuda()
            self.critic_network.cuda()
            self.actor_target_network.cuda()
            self.critic_target_network.cuda()

        # create the optimizers
        self.actor_optim = torch.optim.Adam(self.actor_network.parameters(),
                                            lr=self.config['lr_actor'])
        self.critic_optim = torch.optim.Adam(self.critic_network.parameters(),
                                             lr=self.config['lr_critic'])

        # goal_space_bins should be of the form:
        # [dict(axis=0, box=np.linspace(0.0, 2.0, 15)), dict(axis=1, box=np.linspace(0.0, 2.0, 15)), ...]
        weight_her_sampling = False
        self._num_reached_goals_in_bin = None
        self._num_visited_goals_in_bin = None
        self._num_observed_goals_in_bin = None
        self._goal_space_bins = self.config['goal_space_bins']
        if self._goal_space_bins is not None:
            weight_her_sampling = True
            self._num_reached_goals_in_bin = np.zeros(
                tuple(1 + b['box'].size for b in self._goal_space_bins))
            self._num_visited_goals_in_bin = self._num_reached_goals_in_bin.copy()
            self._num_observed_goals_in_bin = self._num_reached_goals_in_bin.copy()

        # her sampler
        self.her_module = HerSampler(self.config['replay_strategy'],
                                     self.config['replay_k'],
                                     self.env.compute_reward,
                                     weight_sampling=weight_her_sampling,
                                     archer_params=self.config['archer_params'])

        # create the normalizers
        self.o_norm = Normalizer(size=obs_size, default_clip_range=self.config['clip_range'])
        self.g_norm = Normalizer(size=goal_size, default_clip_range=self.config['clip_range'])

        # create the replay and demo buffers
        self.buffer = ReplayBuffer(self.env_params, self.config['buffer_size'],
                                   self.her_module.sample_her_transitions)
        self.demo_buffer = None
        if self.bc_loss:
            self._init_demo_buffer(update_stats=True)

        self._trained = False

    def _bin_idx_for_goals(self, goals: np.ndarray):
        assert self._goal_space_bins is not None
        return tuple(np.digitize(goals[..., b['axis']], b['box'], right=False)
                     for b in self._goal_space_bins)

    def _get_info_for_goals(self, goals: np.ndarray):
        assert self._goal_space_bins is not None
        idx = self._bin_idx_for_goals(goals)
        times_success = self._num_reached_goals_in_bin[idx]
        times_visited = self._num_visited_goals_in_bin[idx]
        times_observed = self._num_observed_goals_in_bin[idx]
        tot_success = self._num_reached_goals_in_bin.sum()
        tot_visited = self._num_visited_goals_in_bin.sum()
        tot_observed = self._num_observed_goals_in_bin.sum()
        return (times_success, tot_success, times_visited, tot_visited,
                times_observed, tot_observed)

    def seed(self, value):
        import random
        np.random.seed(value)
        random.seed(value)
        torch.manual_seed(value)
        self.env.seed(value)

    def _training_step(self):
        rollout_times = []
        update_times = []
        update_results = []
        taken_steps = 0
        failed_steps = 0
        sampling_tot_time = 0.0
        sampling_calls = 0

        step_tic = datetime.now()
        for _ in range(self.config['n_cycles']):
            mb_obs, mb_ag, mb_g, mb_actions = [], [], [], []
            while len(mb_obs) < self.config["rollouts_per_worker"]:
                tic = datetime.now()
                step_failure = False
                # reset the rollouts
                ep_obs, ep_ag, ep_g, ep_actions = [], [], [], []
                # reset the environment
                observation = self.env.reset()
                obs = observation['observation']
                ag = observation['achieved_goal']
                g = observation['desired_goal']
                if self._goal_space_bins is not None:
                    goal_idx = self._bin_idx_for_goals(g)
                    self._num_observed_goals_in_bin[goal_idx] += 1

                # start to collect samples
                for t in range(self.env_params['max_timesteps']):
                    with torch.no_grad():
                        input_tensor = self._preproc_inputs(obs, g)
                        pi = self.actor_network(input_tensor)
                        action = self._select_actions(pi)
                    try:
                        observation_new, _, _, info = self.env.step(action)
                    except MujocoException:
                        step_failure = True
                        break
                    obs_new = observation_new['observation']
                    ag_new = observation_new['achieved_goal']
                    if self._goal_space_bins is not None:
                        goal_idx = self._bin_idx_for_goals(ag_new)
                        self._num_visited_goals_in_bin[goal_idx] += 1
                        if bool(info['is_success']):
                            self._num_reached_goals_in_bin[goal_idx] += 1
                    # append rollouts
                    ep_obs.append(obs.copy())
                    ep_ag.append(ag.copy())
                    ep_g.append(g.copy())
                    ep_actions.append(action.copy())
                    # re-assign the observation
                    obs = obs_new
                    ag = ag_new
                ep_obs.append(obs.copy())
                ep_ag.append(ag.copy())

                if step_failure:
                    failed_steps += 1
                    continue
                taken_steps += self.env_params['max_timesteps']

                mb_obs.append(ep_obs)
                mb_ag.append(ep_ag)
                mb_g.append(ep_g)
                mb_actions.append(ep_actions)
                rollout_times.append((datetime.now() - tic).total_seconds())

            # convert them into arrays
            mb_obs = np.array(mb_obs)
            mb_ag = np.array(mb_ag)
            mb_g = np.array(mb_g)
            mb_actions = np.array(mb_actions)

            # store the episodes
            self.buffer.store_episode([mb_obs, mb_ag, mb_g, mb_actions])
            self._update_normalizer([mb_obs, mb_ag, mb_g, mb_actions])

            tic = datetime.now()
            # train the network
            for _ in range(self.config['n_batches']):
                # sample the episodes
                sampling_tic = datetime.now()
                sampled_transitions = self._sample_batch()
                sampling_tot_time += (datetime.now() - sampling_tic).total_seconds()
                sampling_calls += 1
                res = self._update_network(sampled_transitions)
                update_results.append(res)

            # soft update
            self._soft_update_target_network(self.actor_target_network, self.actor_network)
            self._soft_update_target_network(self.critic_target_network, self.critic_network)
            update_times.append((datetime.now() - tic).total_seconds())

        step_time = (datetime.now() - step_tic).total_seconds()

        tic = datetime.now()
        success_rate, avg_ep_reward = self._eval_agent()
        eval_time = (datetime.now() - tic).total_seconds()

        update_results_dict = dict()
        for k in update_results[0].keys():
            update_results_dict['avg_' + k] = np.mean([r[k] for r in update_results])

        return {
            "test_success_rate": success_rate,
            "test_mean_ep_reward": avg_ep_reward,
            "avg_her_sampling_time": sampling_tot_time / sampling_calls,
            "avg_rollout_time": np.mean(rollout_times),
            "avg_network_update_time": np.mean(update_times),
            "evaluation_time": eval_time,
            "step_time": step_time,
            "env_steps": taken_steps,
            "failed_steps": failed_steps,
            **update_results_dict,
        }

    def _init_demo_buffer(self, update_stats=True):
        assert self.bc_loss
        file_path = self.config['demo_file']
        num_demo = self.config['num_demo']
        self.demo_buffer = ReplayBuffer(self.env_params, self.config['buffer_size'],
                                        self.her_module.sample_her_transitions)

        # data must be a dictionary of (at least) 4 lists; each list contains
        # partial information for each episode
        data = pickle.load(open(file_path, 'rb'))
        assert isinstance(data, dict)
        ordered_data = []
        for k in ['mb_obs', 'mb_ag', 'mb_g', 'mb_actions']:
            mb_data = np.asarray(data[k])
            assert len(mb_data) >= num_demo
            ordered_data.append(mb_data[:num_demo])

        self.demo_buffer.store_episode(ordered_data)
        if update_stats:
            self._update_normalizer(ordered_data)

    def _sample_batch(self):
        batch_size = self.config['batch_size']
        sample_kwargs = dict()
        if self._goal_space_bins is not None:
            sample_kwargs['get_info_for_goals'] = self._get_info_for_goals

        if self.bc_loss:
            demo_batch_size = self.config['demo_batch_size']
            transitions = self.buffer.sample(batch_size - demo_batch_size, **sample_kwargs)
            transitions_demo = self.demo_buffer.sample(demo_batch_size)
            for k, values in transitions_demo.items():
                rollout_vec = transitions[k].tolist()
                for v in values:
                    rollout_vec.append(v.tolist())
                transitions[k] = np.array(rollout_vec)
        else:
            transitions = self.buffer.sample(batch_size, **sample_kwargs)
        return transitions

    def save_checkpoint(self, epoch=0):
        local_dir = self.config.get('local_dir')
        if local_dir is not None:
            local_dir = local_dir + '/checkpoints'
            os.makedirs(local_dir, exist_ok=True)
            model_path = f'{local_dir}/model_{epoch}.pt'
            status_path = f'{local_dir}/status_{epoch}.pkl'
            torch.save([
                self.o_norm.mean, self.o_norm.std, self.g_norm.mean,
                self.g_norm.std, self.actor_network.state_dict()
            ], model_path)
            with open(status_path, 'wb') as f:
                pickle.dump(dict(config=self.config), f)

    @staticmethod
    def load(env, local_dir, epoch=None):
        epoch = epoch or '*[0-9]'
        models = glob.glob(f'{local_dir}/model_{epoch}.pt')
        assert len(models) > 0, "No checkpoints found!"
        model_path = sorted(models, key=os.path.getmtime)[-1]
        epoch = model_path.split("_")[-1].split(".")[0]
        status_path = f'{local_dir}/status_{epoch}.pkl'
        with open(status_path, 'rb') as f:
            status = pickle.load(f)
        status['config']['cuda'] = torch.cuda.is_available()
        agent = DdpgHer(env, status['config'])
        agent._trained = True
        o_mean, o_std, g_mean, g_std, actor_state = torch.load(
            model_path, map_location=lambda storage, loc: storage)
        agent.o_norm.mean = o_mean
        agent.o_norm.std = o_std
        agent.g_norm.mean = g_mean
        agent.g_norm.std = g_std
        agent.actor_network.load_state_dict(actor_state)
        agent.actor_network.eval()
        print(f'Loaded model for epoch {epoch}.')
        return agent

    def predict(self, obs):
        if not self._trained:
            raise RuntimeError
        g = obs['desired_goal']
        obs = obs['observation']
        with torch.no_grad():
            inputs = self._preproc_inputs(obs, g)
            pi = self.actor_network(inputs)
            action = pi.cpu().numpy().squeeze()
        return action

    def train(self):
        if self._trained:
            raise RuntimeError

        # make sure that different workers have different seeds
        # (from baselines' original implementation)
        local_uniform = np.random.uniform(size=(1,))
        root_uniform = local_uniform.copy()
        MPI.COMM_WORLD.Bcast(root_uniform, root=0)
        if MPI.COMM_WORLD.Get_rank() != 0:
            assert local_uniform[0] != root_uniform[0]

        tic = datetime.now()
        n_epochs = self.config.get('n_epochs')
        saved_checkpoints = 0
        total_env_steps = 0
        for iter_i in it.count():
            if n_epochs is not None and iter_i >= n_epochs:
                break
            res = self._training_step()
            total_env_steps += res['env_steps']
            if MPI.COMM_WORLD.Get_rank() == 0:
                if (iter_i + 1) % self.config['checkpoint_freq'] == 0:
                    self.save_checkpoint(epoch=(iter_i + 1))
                    saved_checkpoints += 1
                if callable(self.reporter):
                    self.reporter(**{
                        **res,
                        "training_iteration": iter_i + 1,
                        "total_time": (datetime.now() - tic).total_seconds(),
                        "checkpoints": saved_checkpoints,
                        "total_env_steps": total_env_steps,
                        "current_buffer_size": self.buffer.current_size,
                    })

    # pre-process the inputs
    def _preproc_inputs(self, obs, g):
        obs_norm = self.o_norm.normalize(obs)
        g_norm = self.g_norm.normalize(g)
        # concatenate observation and goal
        inputs = np.concatenate([obs_norm, g_norm])
        inputs = torch.tensor(inputs, dtype=torch.float32).unsqueeze(0)
        if self.config['cuda']:
            inputs = inputs.cuda()
        return inputs

    # choose an action for the agent and handle exploration
    def _select_actions(self, pi):
        action = pi.cpu().numpy().squeeze()
        # add gaussian noise
        action += self.config['noise_eps'] * self.env_params['action_max'] * np.random.randn(*action.shape)
        action = np.clip(action, -self.env_params['action_max'], self.env_params['action_max'])
        # random actions...
        random_actions = np.random.uniform(low=-self.env_params['action_max'],
                                           high=self.env_params['action_max'],
                                           size=self.env_params['action'])
        # choose whether to use the random action
        action += np.random.binomial(1, self.config['random_eps'], 1)[0] * (random_actions - action)
        return action

    # update the normalizer
    def _update_normalizer(self, episode_batch):
        mb_obs, mb_ag, mb_g, mb_actions = episode_batch
        mb_obs_next = mb_obs[:, 1:, :]
        mb_ag_next = mb_ag[:, 1:, :]
        # get the number of normalization transitions
        num_transitions = mb_actions.shape[1]
        # create the new buffer to store them
        buffer_temp = {
            'obs': mb_obs,
            'ag': mb_ag,
            'g': mb_g,
            'actions': mb_actions,
            'obs_next': mb_obs_next,
            'ag_next': mb_ag_next,
        }
        transitions = self.her_module.sample_her_transitions(buffer_temp, num_transitions)
        obs, g = transitions['obs'], transitions['g']
        # pre-process the obs and g
        transitions['obs'], transitions['g'] = self._preproc_og(obs, g)
        # update
        self.o_norm.update(transitions['obs'])
        self.g_norm.update(transitions['g'])
        # recompute the stats
        self.o_norm.recompute_stats()
        self.g_norm.recompute_stats()

    def _preproc_og(self, o, g):
        o = np.clip(o, -self.config['clip_obs'], self.config['clip_obs'])
        g = np.clip(g, -self.config['clip_obs'], self.config['clip_obs'])
        return o, g

    # soft update
    def _soft_update_target_network(self, target, source):
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_((1 - self.config['polyak']) * param.data
                                    + self.config['polyak'] * target_param.data)

    # update the networks
    def _update_network(self, transitions):
        # pre-process the observation and goal
        o, o_next, g = transitions['obs'], transitions['obs_next'], transitions['g']
        transitions['obs'], transitions['g'] = self._preproc_og(o, g)
        transitions['obs_next'], transitions['g_next'] = self._preproc_og(o_next, g)

        # start to do the update
        obs_norm = self.o_norm.normalize(transitions['obs'])
        g_norm = self.g_norm.normalize(transitions['g'])
        inputs_norm = np.concatenate([obs_norm, g_norm], axis=1)
        obs_next_norm = self.o_norm.normalize(transitions['obs_next'])
        g_next_norm = self.g_norm.normalize(transitions['g_next'])
        inputs_next_norm = np.concatenate([obs_next_norm, g_next_norm], axis=1)

        # transfer them into tensors
        inputs_norm_tensor = torch.tensor(inputs_norm, dtype=torch.float32)
        inputs_next_norm_tensor = torch.tensor(inputs_next_norm, dtype=torch.float32)
        actions_tensor = torch.tensor(transitions['actions'], dtype=torch.float32)
        r_tensor = torch.tensor(transitions['r'], dtype=torch.float32)
        if self.config['cuda']:
            inputs_norm_tensor = inputs_norm_tensor.cuda()
            inputs_next_norm_tensor = inputs_next_norm_tensor.cuda()
            actions_tensor = actions_tensor.cuda()
            r_tensor = r_tensor.cuda()

        # calculate the target Q value function
        with torch.no_grad():
            actions_next = self.actor_target_network(inputs_next_norm_tensor)
            q_next_value = self.critic_target_network(inputs_next_norm_tensor, actions_next)
            q_next_value = q_next_value.detach()
            target_q_value = r_tensor + self.config['gamma'] * q_next_value
            target_q_value = target_q_value.detach()
            # clip the q value
            clip_return = 1 / (1 - self.config['gamma'])
            target_q_value = torch.clamp(target_q_value, -clip_return, 0)

        # the q loss
        real_q_value = self.critic_network(inputs_norm_tensor, actions_tensor)
        critic_loss = (target_q_value - real_q_value).pow(2).mean()
        # self.main.Q_tf    ==> real_q_value
        # self.main.Q_pi_tf ==> self.critic_network(inputs_norm_tensor, actions_real) ==> approx_q_value

        # the actor loss
        action_l2 = self.config['action_l2']
        actions_real = self.actor_network(inputs_norm_tensor)
        approx_q_value = self.critic_network(inputs_norm_tensor, actions_real)

        if self.bc_loss:
            # train with demonstrations using behavior cloning;
            # the mask selects only the demo-buffer samples
            b_size = self.config['batch_size']
            demo_b_size = self.config['demo_batch_size']
            mask = np.concatenate((np.zeros(b_size - demo_b_size), np.ones(demo_b_size)), axis=0)
            mask = torch.tensor(mask, dtype=torch.uint8, device=actions_real.device)
            if self.q_filter:
                # use the Q-filter trick to perform BC only when needed
                with torch.no_grad():
                    mask &= (real_q_value > approx_q_value).squeeze()
            prm_loss_weight = self.config['prm_loss_weight']
            cloning_loss = self.config['aux_loss_weight'] * (
                actions_real[mask] - actions_tensor[mask]).pow(2).sum()
        else:
            # train without demonstrations
            prm_loss_weight = 1.0
            cloning_loss = None

        actor_loss = -prm_loss_weight * approx_q_value.mean()
        actor_loss += prm_loss_weight * action_l2 * (
            actions_real / self.env_params['action_max']).pow(2).mean()
        if cloning_loss is not None:
            actor_loss += cloning_loss

        # update the actor network
        self.actor_optim.zero_grad()
        actor_loss.backward()
        sync_grads(self.actor_network)
        self.actor_optim.step()

        # update the critic network
        self.critic_optim.zero_grad()
        critic_loss.backward()
        sync_grads(self.critic_network)
        self.critic_optim.step()

        res = dict(actor_loss=actor_loss.item(), critic_loss=critic_loss.item())
        if cloning_loss is not None:
            res['cloning_loss'] = cloning_loss.item()
        return res

    # do the evaluation
    def _eval_agent(self):
        total_success_rate = []
        ep_rewards = []
        for _ in range(self.config['n_test_rollouts']):
            per_success_rate = []
            ep_reward = 0.0
            observation = self.env.reset()
            obs = observation['observation']
            g = observation['desired_goal']
            for _ in range(self.env_params['max_timesteps']):
                with torch.no_grad():
                    input_tensor = self._preproc_inputs(obs, g)
                    pi = self.actor_network(input_tensor)
                    # convert the actions
                    actions = pi.detach().cpu().numpy().squeeze()
                observation_new, rew, _, info = self.env.step(actions)
                obs = observation_new['observation']
                g = observation_new['desired_goal']
                per_success_rate.append(info['is_success'])
                ep_reward += rew
            ep_rewards.append(ep_reward)
            total_success_rate.append(per_success_rate)

        total_success_rate = np.array(total_success_rate)
        local_success_rate = np.mean(total_success_rate[:, -1])
        global_success_rate = MPI.COMM_WORLD.allreduce(local_success_rate, op=MPI.SUM)
        global_success_rate /= MPI.COMM_WORLD.Get_size()
        avg_ep_reward = np.array(ep_rewards).mean()
        global_avg_ep_reward = MPI.COMM_WORLD.allreduce(avg_ep_reward, op=MPI.SUM)
        global_avg_ep_reward /= MPI.COMM_WORLD.Get_size()
        return global_success_rate, global_avg_ep_reward
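# A minimal usage sketch for DdpgHer (illustrative; the environment id, paths
# and config overrides below are assumptions, not part of the class). The agent
# expects a goal-based Gym environment and is normally launched under MPI,
# e.g. `mpirun -np 8 python train.py`.
def _example_run():
    import gym

    def report(**metrics):
        print(metrics.get("training_iteration"), metrics.get("test_success_rate"))

    env = gym.make("FetchPickAndPlace-v1")
    agent = DdpgHer(env, config={"n_epochs": 10, "local_dir": "./runs/fpp"}, reporter=report)
    agent.train()  # checkpoints land in ./runs/fpp/checkpoints every `checkpoint_freq` epochs
    restored = DdpgHer.load(env, "./runs/fpp/checkpoints")
    return restored.predict(env.reset())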
class DDPG:
    """ Deep Deterministic Policy Gradient (DDPG) helper class """

    def __init__(self, env, act_dim, state_dim, goal_dim, act_range,
                 buffer_size=int(1e6), gamma=0.98, lr=0.001, tau=0.95):
        """ Initialization """
        # Environment and DDPG parameters
        self.act_dim = act_dim
        self.act_range = act_range
        self.env_dim = state_dim + goal_dim
        self.gamma = gamma
        self.lr = lr
        self.tau = tau
        self.env = env

        # Create actor and critic networks
        self.actor_network = Actor(self.env_dim, act_dim, act_range)
        self.actor_target_network = Actor(self.env_dim, act_dim, act_range)
        self.actor_target_network.load_state_dict(self.actor_network.state_dict())

        self.critic_network = Critic(self.env_dim, act_dim, act_range)
        self.critic_target_network = Critic(self.env_dim, act_dim, act_range)
        self.critic_target_network.load_state_dict(self.critic_network.state_dict())

        sync_networks(self.actor_network)
        sync_networks(self.critic_network)

        # Optimizers
        self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=lr)
        self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=lr)

        # Replay buffer
        # self.buffer = MemoryBuffer(buffer_size)
        self.buffer = ReplayMemory(buffer_size)

        # Normalizers (clip between [-5, 5])
        self.goal_normalizer = Normalizer(goal_dim, default_clip_range=5)
        self.state_normalizer = Normalizer(state_dim, default_clip_range=5)

    def policy_action(self, s, g):
        """ Use the actor to predict an action """
        inputs = self.preprocess_inputs(s, g)
        return self.actor_network(inputs)

    def memorize(self, experiences):
        """ Store experience in the memory buffer """
        for exp in experiences:
            self.buffer.push(exp)

    def sample_batch(self, batch_size):
        return deepcopy(self.buffer.sample(batch_size))

    def clip_states_goals(self, state, goal):
        state = np.clip(state, -200, 200)
        goal = np.clip(goal, -200, 200)
        return state, goal

    def preprocess_inputs(self, state, goal):
        """ Normalize and concatenate state and goal """
        # state, goal = self.clip_states_goals(state, goal)
        state_norm = self.state_normalizer.normalize(state)
        goal_norm = self.goal_normalizer.normalize(goal)
        inputs = np.concatenate([state_norm, goal_norm])
        return torch.tensor(inputs, dtype=torch.float32).unsqueeze(0)

    def select_actions(self, pi):
        # add gaussian noise
        action = pi.cpu().numpy().squeeze()
        action += 0.2 * self.act_range * np.random.randn(*action.shape)
        action = np.clip(action, -self.act_range, self.act_range)
        # random actions...
        random_actions = np.random.uniform(low=-self.act_range, high=self.act_range,
                                           size=self.act_dim)
        # choose whether to use the random action
        action += np.random.binomial(1, 0.3, 1)[0] * (random_actions - action)
        action = np.clip(action, -self.act_range, self.act_range)
        return action

    def update_network(self, batch_size):
        s, actions, rewards, ns, _, g = self.sample_batch(batch_size)
        states, goals = self.clip_states_goals(s, g)
        new_states, new_goals = self.clip_states_goals(ns, g)

        norm_states = self.state_normalizer.normalize(states)
        norm_goals = self.goal_normalizer.normalize(goals)
        inputs_norm = np.concatenate([norm_states, norm_goals], axis=1)

        norm_new_states = self.state_normalizer.normalize(new_states)
        norm_new_goals = self.goal_normalizer.normalize(new_goals)
        inputs_next_norm = np.concatenate([norm_new_states, norm_new_goals], axis=1)

        # To tensors
        inputs_norm_tensor = torch.tensor(inputs_norm, dtype=torch.float32)
        inputs_next_norm_tensor = torch.tensor(inputs_next_norm, dtype=torch.float32)
        actions_tensor = torch.tensor(actions, dtype=torch.float32)
        r_tensor = torch.tensor(rewards, dtype=torch.float32)

        with torch.no_grad():
            actions_next = self.actor_target_network(inputs_next_norm_tensor)
            q_next_value = self.critic_target_network(inputs_next_norm_tensor, actions_next)
            q_next_value = q_next_value.detach()
            target_q_value = r_tensor + self.gamma * q_next_value
            target_q_value = target_q_value.detach()
            # clip the q value
            clip_return = 1 / (1 - self.gamma)
            target_q_value = torch.clamp(target_q_value, -clip_return, 0)

        # the q loss
        real_q_value = self.critic_network(inputs_norm_tensor, actions_tensor)
        critic_loss = (target_q_value - real_q_value).pow(2).mean()

        # the actor loss
        actions_real = self.actor_network(inputs_norm_tensor)
        actor_loss = -self.critic_network(inputs_norm_tensor, actions_real).mean()
        actor_loss += 1.0 * (actions_real / self.act_range).pow(2).mean()

        # update the actor network
        self.actor_optim.zero_grad()
        actor_loss.backward()
        sync_grads(self.actor_network)
        self.actor_optim.step()

        # update the critic network
        self.critic_optim.zero_grad()
        critic_loss.backward()
        sync_grads(self.critic_network)
        self.critic_optim.step()

    def soft_update_target_network(self, target, source):
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_((1 - self.tau) * param.data + self.tau * target_param.data)

    def train(self, args):
        if MPI.COMM_WORLD.Get_rank() == 0:
            self.create_save_dir(args["save_dir"], args["env_name"], args["HER_strat"])

        success_rates = []
        for ep_num in range(NUM_EPOCHS):
            start = time.time()
            for _ in range(NUM_CYCLES):
                for _ in range(ROLLOUT_PER_WORKER):
                    # Reset episode
                    observation = self.env.reset()
                    current_state = observation['observation']
                    goal = observation['desired_goal']
                    old_achieved_goal = observation['achieved_goal']
                    episode_exp = []
                    episode_exp_her = []

                    for _ in range(self.env._max_episode_steps):
                        if args['render']:
                            self.env.render()
                        with torch.no_grad():
                            pi = self.policy_action(current_state, goal)
                            action = self.select_actions(pi)
                        obs, reward, _, _ = self.env.step(action)
                        new_state = obs['observation']
                        new_achieved_goal = obs['achieved_goal']
                        # Add the transition to the episode buffer
                        episode_exp.append([current_state, action, reward, new_state,
                                            old_achieved_goal, goal])
                        if reward == 0:
                            break
                        old_achieved_goal = new_achieved_goal
                        current_state = new_state

                    if args["HER_strat"] == "final":
                        experience = episode_exp[-1]
                        # set g' to the achieved goal
                        experience[-1] = np.copy(experience[-2])
                        reward = self.env.compute_reward(experience[-2], experience[-1], None)
                        # set the reward of success
                        experience[2] = reward
                        episode_exp_her.append(experience)
                    elif args["HER_strat"] in ["future", "episode"]:
                        # For each transition of the episode trajectory
                        for t in range(len(episode_exp)):
                            # Add K random states which come from the same episode as the transition
                            for _ in range(args["HER_k"]):
                                if args["HER_strat"] == "future":
                                    # Select a future transition from the same episode
                                    selected = np.random.randint(t, len(episode_exp))
                                elif args["HER_strat"] == "episode":
                                    # Select any transition from the same episode
                                    selected = np.random.randint(0, len(episode_exp))
                                # Take the achieved goal of the selected transition
                                # (index 4 of [s, a, r, ns, ag, g])
                                ag_selected = np.copy(episode_exp[selected][4])
                                s, a, _, ns, ag, _ = episode_exp[t]
                                r = self.env.compute_reward(ag_selected, ag, None)
                                # New transition whose goal is the selected achieved goal
                                her_transition = [s, a, r, ns, ag, ag_selected]
                                episode_exp_her.append(her_transition)

                    self.memorize(deepcopy(episode_exp))
                    self.memorize(deepcopy(episode_exp_her))
                    # Update normalizers with the observations of this episode
                    self.update_normalizers(deepcopy(episode_exp), deepcopy(episode_exp_her))

                for _ in range(OPTIMIZATION_STEPS):
                    # Sample experience from the buffer and update the networks
                    self.update_network(args["batch_size"])

                # Soft update the target networks
                self.soft_update_target_network(self.actor_target_network, self.actor_network)
                self.soft_update_target_network(self.critic_target_network, self.critic_network)

            success_rate = self.eval()
            success_rates.append(success_rate)
            if MPI.COMM_WORLD.Get_rank() == 0:
                print("Epoch:", ep_num + 1,
                      " -- success rate:", success_rates[-1],
                      " -- duration:", time.time() - start)
                torch.save([
                    self.state_normalizer.mean, self.state_normalizer.std,
                    self.goal_normalizer.mean, self.goal_normalizer.std,
                    self.actor_network.state_dict()
                ], self.model_path + '/model.pt')

        return success_rates

    def create_save_dir(self, save_dir, env_name, her_strat):
        if not os.path.exists(save_dir):
            os.mkdir(save_dir)
        # path to save the model
        subdir = os.path.join(save_dir, env_name)
        if not os.path.exists(subdir):
            os.mkdir(subdir)
        self.model_path = os.path.join(save_dir, env_name, her_strat)
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)

    def update_normalizers(self, episode_exp, episode_exp_her):
        # Update normalizers
        episode_exp_states = np.vstack(np.array(episode_exp)[:, 0])
        episode_exp_goals = np.vstack(np.array(episode_exp)[:, 5])
        if len(episode_exp_her) != 0:
            episode_exp_her_states = np.vstack(np.array(episode_exp_her)[:, 0])
            episode_exp_her_goals = np.vstack(np.array(episode_exp_her)[:, 5])
            states = np.concatenate([episode_exp_states, episode_exp_her_states])
            goals = np.concatenate([episode_exp_goals, episode_exp_her_goals])
        else:
            states = np.copy(episode_exp_states)
            goals = np.copy(episode_exp_goals)

        states, goals = self.clip_states_goals(states, goals)
        self.state_normalizer.update(deepcopy(states))
        self.goal_normalizer.update(deepcopy(goals))
        self.state_normalizer.recompute_stats()
        self.goal_normalizer.recompute_stats()

    def eval(self):
        total_success_rate = []
        for _ in range(NUM_TEST):
            per_success_rate = []
            observation = self.env.reset()
            state = observation['observation']
            goal = observation['desired_goal']
            for _ in range(self.env._max_episode_steps):
                # self.env.render()
                with torch.no_grad():
                    inputs = self.preprocess_inputs(state, goal)
                    pi = self.actor_network(inputs)
                    action = pi.detach().cpu().numpy().squeeze()
                new_observation, _, _, info = self.env.step(action)
                state = new_observation['observation']
                per_success_rate.append(info['is_success'])
            total_success_rate.append(per_success_rate)

        total_success_rate = np.array(total_success_rate)
        local_success_rate = np.mean(total_success_rate[:, -1])
        global_success_rate = MPI.COMM_WORLD.allreduce(local_success_rate, op=MPI.SUM)
        return global_success_rate / MPI.COMM_WORLD.Get_size()
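# A compact, self-contained sketch of the HER "future" relabelling that
# DDPG.train performs inline above. Each transition is laid out as
# [state, action, reward, next_state, achieved_goal, goal]; `compute_reward`
# follows the Gym goal-env signature (achieved_goal, desired_goal, info).
# The helper itself and its names are illustrative, not part of the class.
def her_future_relabel(episode_exp, compute_reward, k=4):
    import numpy as np
    relabelled = []
    for t in range(len(episode_exp)):
        for _ in range(k):
            selected = np.random.randint(t, len(episode_exp))   # a future step
            new_goal = np.copy(episode_exp[selected][4])         # its achieved goal
            s, a, _, ns, ag, _ = episode_exp[t]
            r = compute_reward(ag, new_goal, None)               # reward under the substituted goal
            relabelled.append([s, a, r, ns, ag, new_goal])
    return relabelled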