def __init__(self, policy_fn, agents, dims, logger, make_env, T, use_her, rollout_batch_size=1,
             compute_Q=False, render=False, history_len=100):
    """Rollout worker generates experience by interacting with one or many environments.

    Args:
        policy_fn (function): the policy function that is used to act
        agents (dict of objects): the DDPG agents used by the rollout worker
        dims (dict of ints): the dimensions for observations (o), goals (g), and actions (u)
        logger (object): the logger that is used by the rollout worker
        make_env (function): a factory function that creates a new instance of the environment when called
        T (int): the number of timesteps per rollout episode
        use_her (boolean): whether hindsight experience replay goals and success statistics are tracked
        rollout_batch_size (int): the number of parallel rollouts that should be used
        compute_Q (boolean): whether or not to compute the Q values alongside the actions
        render (boolean): whether or not to render the rollouts
        history_len (int): length of history for statistics smoothing
    """
    # Store constructor arguments on the instance; they are referenced as attributes below.
    self.policy_fn = policy_fn
    self.agents = agents
    self.dims = dims
    self.logger = logger
    self.make_env = make_env
    self.T = T
    self.use_her = use_her
    self.rollout_batch_size = rollout_batch_size
    self.compute_Q = compute_Q
    self.render = render

    self.envs = [make_env() for _ in range(rollout_batch_size)]
    assert (np.abs(self.envs[0].action_space.low) == self.envs[0].action_space.high).all()  # we assume symmetric actions.
    self.max_action = self.envs[0].action_space.high
    logger.info('Scaling actions by {} before executing in env'.format(self.max_action))

    assert self.T > 0

    self.info_keys = [key.replace('info_', '') for key in dims.keys() if key.startswith('info_')]

    if self.use_her:
        self.success_history = deque(maxlen=history_len)
    self.reward_per_episode_history = deque(maxlen=history_len)
    self.Q_history = deque(maxlen=history_len)

    self.n_episodes = 0
    self.initial_o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32)  # observations
    if self.use_her:
        self.g = np.empty((self.rollout_batch_size, self.dims['g']), np.float32)  # goals
        self.initial_ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32)  # achieved goals
    self.total_reward_this_episode = np.zeros((self.rollout_batch_size,), np.float32)
    self.reset_all(force_env_resets=True)
    self.clear_history()

    self.current_heatmap_prefix = None
    self.recording = False
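# Usage sketch for the constructor above. The class name `RolloutWorker` and the example
# dimensions are assumptions for illustration; the repo builds workers via its own
# configure_rollout_worker helper (see run() below).
worker = RolloutWorker(
    policy_fn=policy_fn,              # function used to pick actions during rollouts
    agents=agents,                    # dict of DDPG agents
    dims={'o': 10, 'g': 3, 'u': 4},   # hypothetical observation/goal/action dimensions
    logger=logger,
    make_env=make_env,                # factory returning a goal-based gym environment
    T=50,                             # 50 timesteps per episode
    use_her=True,
    rollout_batch_size=2,             # two environments rolled out in parallel
    compute_Q=False,
    render=False)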
def display_var_info(vars):
    from ddpg_curiosity_mc_her import logger
    count_params = 0
    for v in vars:
        name = v.name
        if "/Adam" in name or "beta1_power" in name or "beta2_power" in name:
            continue
        v_params = np.prod(v.shape.as_list())
        count_params += v_params
        if "/b:" in name or "/biases" in name:
            continue    # Wx+b, bias is not interesting to look at => count params, but not print
        logger.info("   %s%s %i params %s" % (name, " " * (55 - len(name)), v_params, str(v.shape)))

    logger.info("Total model parameters: %0.2f million" % (count_params * 1e-6))
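# Example usage (a sketch): print a summary of every trainable variable in the default graph,
# assuming the TF graph has already been built.
import tensorflow as tf

display_var_info(tf.trainable_variables())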
def __init__(self, obs0, action, obs1, clip_norm, hidden, layers, comm):
    logger.info("Using Forward Dynamics")

    assert hidden is not None
    assert layers is not None

    with tf.variable_scope('forward_dynamics'):
        self.dynamics_scope = tf.get_variable_scope().name

        input = tf.concat(values=[obs0, action], axis=-1)
        next_state_tf = nn(input, [hidden] * layers + [obs1.shape[-1]])

    # loss functions
    self.per_sample_loss_tf = tf.expand_dims(tf.reduce_mean(tf.square(next_state_tf - obs1), axis=1), axis=1)
    self.mean_loss_tf = tf.reduce_mean(self.per_sample_loss_tf)
    self.dynamics_grads = U.flatgrad(self.mean_loss_tf, _vars(self.dynamics_scope), clip_norm=clip_norm)

    # optimizers
    self.dynamics_adam = MpiAdam(_vars(self.dynamics_scope), scale_grad_by_procs=False, comm=comm)
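# Sketch: using the forward-dynamics prediction error as a per-transition intrinsic reward.
# The class name `ForwardDynamics` and the placeholder names (obs0_ph, action_ph, obs1_ph) are
# illustrative assumptions; in the repo the DDPG agent wires its own tensors into this module.
fd = ForwardDynamics(obs0=obs0_ph, action=action_ph, obs1=obs1_ph,
                     clip_norm=None, hidden=256, layers=3, comm=MPI.COMM_WORLD)
# Transitions the model predicts poorly (novel dynamics) yield a larger exploration bonus.
intrinsic_reward = sess.run(fd.per_sample_loss_tf,
                            feed_dict={obs0_ph: o_batch, action_ph: u_batch, obs1_ph: o_next_batch})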
def configure_replay_buffer(params):
    logger.info('Using Replay Buffer')

    sample_transitions = configure_her(params)
    input_dims = configure_dims(params)
    input_shapes = dims_to_shapes(input_dims)

    buffer_shapes = {
        key: (params['T'] if key != 'o' else params['T'] + 1, *input_shapes[key])
        for key, val in input_shapes.items()
    }

    if params['use_her']:
        buffer_shapes['g'] = (buffer_shapes['g'][0], input_dims['g'])
        buffer_shapes['ag'] = (params['T'] + 1, input_dims['g'])
    else:
        buffer_shapes['r'] = (params['T'], 1)
        buffer_shapes['t'] = (params['T'], 1)

    buffer_size = (params['buffer_size'] // params['rollout_batch_size']) * params['rollout_batch_size']

    return ReplayBuffer(buffer_shapes, buffer_size, params['T'], sample_transitions, params['use_her'])
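# Worked example of the shapes produced above, under hypothetical dimensions
# (T=50, o=10, g=3, u=4, use_her=True). This is a standalone illustration, not repo code:
# observations keep T+1 entries per episode (the final state is stored), everything else keeps T.
T = 50
input_shapes = {'o': (10,), 'g': (3,), 'u': (4,)}
buffer_shapes = {key: (T if key != 'o' else T + 1, *shape) for key, shape in input_shapes.items()}
buffer_shapes['ag'] = (T + 1, 3)  # achieved goals, also stored for the final state
assert buffer_shapes == {'o': (51, 10), 'g': (50, 3), 'u': (50, 4), 'ag': (51, 3)}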
def run(args):
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # If we are supposed to divide gpu usage among a specific set of devices,
    # set this process's device to the correct one.
    gpu_nums = args['split_gpu_usage_among_device_nums']
    if gpu_nums is not None:
        gpu_num_to_use = gpu_nums[rank % len(gpu_nums)]
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_num_to_use)

    # Seed everything to make things reproducible.
    rank_seed = args['seed'] + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, rank_seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(rank_seed)

    input_dims = configure_dims(args)

    # Configure the replay buffer.
    memory = configure_memory(args)

    with U.single_threaded_session() as sess:
        # Set up DDPG agents.
        agents = create_agents(sess=sess, memory=memory, input_dims=input_dims, params=args)

        saver = tf.train.Saver()
        if args['restore_from_ckpt'] is not None:
            logger.info("Restoring agents from {}".format(args['restore_from_ckpt']))
            saver.restore(sess, args['restore_from_ckpt'])

        sess.graph.finalize()
        logger.log_graph_to_tensorboard(sess.graph)

        # Set up rollout workers.
        train_policy_fn = get_policy_fn(name=args['train_policy_fn'], agents=agents)
        eval_policy_fn = get_policy_fn(name=args['eval_policy_fn'], agents=agents)

        train_rollout_worker = configure_rollout_worker(
            role='train', policy_fn=train_policy_fn, agents=agents, dims=input_dims,
            seed=rank_seed, logger=logger, params=args)

        eval_rollout_worker = configure_rollout_worker(
            role='eval', policy_fn=eval_policy_fn, agents=agents, dims=input_dims,
            seed=rank_seed, logger=logger, params=args)

        # Begin main training loop.
        if rank == 0:
            start_time = time.time()

        if args['do_demo_only'] is False:
            training.train(
                memory=memory, agents=agents, saver=saver, sess=sess,
                train_rollout_worker=train_rollout_worker,
                eval_rollout_worker=eval_rollout_worker,
                param_noise_adaption_interval=50,
                **args)
        else:
            demo.demo(agents=agents, eval_rollout_worker=eval_rollout_worker,
                      demo_video_recording_name=args["demo_video_recording_name"])

        train_rollout_worker.close()
        eval_rollout_worker.close()

        if rank == 0:
            logger.info('total runtime: {}s'.format(time.time() - start_time))
def configure_ddpg_agent(sess, role, memory, input_dims, external_critic_fn, params):
    input_shapes = dims_to_shapes(input_dims)
    observation_shape = input_shapes['o']
    goal_shape = input_shapes['g'] if params['use_her'] else None
    action_shape = input_shapes['u']
    action_dim = input_dims['u']

    if role == 'exploit':
        comm = MPI.COMM_WORLD
        use_goals = True if params['use_her'] else False
        use_intrinsic_reward = False
        dynamics_loss_mapper = None
        mix_external_critic_with_internal = None
        external_critic_fn = None
    elif role == 'explore':
        comm = params['explore_comm']
        assert comm != MPI.COMM_WORLD
        use_intrinsic_reward = True
        dynamics_loss_mapper = params['dynamics_loss_mapper']
        mix_external_critic_with_internal = params['mix_extrinsic_intrinsic_objectives_for_explore']
        if mix_external_critic_with_internal is not None:
            assert len(mix_external_critic_with_internal) == 2
            assert external_critic_fn is not None
            use_goals = True if params['use_her'] else False
        else:
            use_goals = False
            external_critic_fn = None
    else:
        raise ValueError('role must either be \'exploit\' or \'explore\'.')

    agent = DDPG(
        sess=sess,
        scope=role + '_ddpg',
        layer_norm=params[role + '_use_layer_norm'],
        nb_actions=action_dim,
        memory=memory,
        observation_shape=observation_shape,
        action_shape=action_shape,
        goal_shape=goal_shape,
        param_noise=params[role + '_param_noise'],
        action_noise=params[role + '_action_noise'],
        gamma=params[role + '_gamma'],
        tau=params[role + '_polyak_tau'],
        normalize_returns=params[role + '_normalize_returns'],
        enable_popart=params[role + '_popart'],
        normalize_observations=params['agents_normalize_observations'],
        normalize_goals=params['agents_normalize_goals'],
        batch_size=params['batch_size'],
        observation_range=(-5., 5.),
        goal_range=(-200, 200),
        action_range=(-1., 1.),
        return_range=(-np.inf, np.inf),
        critic_l2_reg=params[role + '_critic_l2_reg'],
        actor_lr=params[role + '_pi_lr'],
        critic_lr=params[role + '_Q_lr'],
        clip_norm=None,
        reward_scale=1.,
        use_intrinsic_reward=use_intrinsic_reward,
        use_goals=use_goals,
        agent_hidden_layer_sizes=[params[role + '_hidden']] * params[role + '_layers'],
        dynamics_hidden=params['dynamics_hidden'],
        dynamics_layers=params['dynamics_layers'],
        dynamics_normalize_observations=params['dynamics_normalize_observations'],
        dynamics_loss_mapper=dynamics_loss_mapper,
        mix_external_critic_with_internal=mix_external_critic_with_internal,
        external_critic_fn=external_critic_fn,
        intrinsic_motivation_method=params['intrinsic_motivation_method'],
        comm=comm)

    logger.info('Using ' + role + ' agent.')
    # logger.info('Using ' + role + ' agent with the following configuration:')
    # logger.info(str(agent.__dict__.items()))

    return agent
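# The 'explore' role expects params['explore_comm'] to be an MPI communicator distinct from
# MPI.COMM_WORLD (see the assert above), so the explore agents can average gradients separately
# from the exploit agents. A hypothetical way to build such a communicator is to split the
# world communicator; this is a sketch, not the repo's actual setup code.
from mpi4py import MPI

world = MPI.COMM_WORLD
explore_comm = world.Split(color=0, key=world.Get_rank())  # same ranks, separate communication context
assert explore_comm != MPI.COMM_WORLD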
def log_params(params, logger=logger):
    for key in sorted(params.keys()):
        logger.info('{}: {}'.format(key, params[key]))
def train(memory, agents, saver, sess, train_rollout_worker, eval_rollout_worker, n_epochs, n_cycles,
          n_batches, batch_size, rollout_batches_per_cycle, n_test_rollouts, heatmaps,
          dynamics_loss_mapper, do_evaluation, save_at_score, stop_at_score, save_checkpoints_at,
          **kwargs):
    rank = MPI.COMM_WORLD.Get_rank()

    logger.info("Training...")
    batch = 0
    should_quit_early = False

    for epoch in range(1, n_epochs + 1):
        epoch_start_time = datetime.now()

        if dynamics_loss_mapper is not None:
            dynamics_loss_mapper.set_record_write(prefix='epoch{}_rank{}'.format(epoch, rank))

        # train
        train_rollout_worker.clear_history()
        for cycle_index in range(n_cycles):
            for _ in range(rollout_batches_per_cycle):
                episode = train_rollout_worker.generate_rollouts(
                    render_override=False,
                    heatmap_prefix='epoch{}_rank{}'.format(epoch, rank) if heatmaps else None)
                memory.store_episode(episode)
                for agent in agents.values():
                    agent.update_normalizers(episode)

            param_noise_distances = {}

            # Adapt param noise.
            if memory.nb_entries >= batch_size:
                for role, agent in agents.items():
                    param_noise_distances[role] = agent.adapt_param_noise()

            for train_step in range(n_batches):
                critic_losses = {}
                actor_losses = {}
                for role, agent in agents.items():
                    critic_losses[role], actor_losses[role] = agent.train()
                for agent in agents.values():
                    agent.update_target_net()
                batch += 1

        if heatmaps:
            train_rollout_worker.flush_env_location_records()
            MPI.COMM_WORLD.Barrier()
            logger.info("Creating heatmap...")
            if rank == 0:
                heatmap_save_path = generate_3d_fetch_stack_heatmap_from_npy_records(
                    working_dir=os.path.join(logger.get_dir(), 'heatmaps'),
                    file_prefix='epoch{}'.format(epoch),
                    delete_records=True)
                logger.info("Heatmap saved to {}".format(heatmap_save_path))

        # test
        if do_evaluation:
            eval_rollout_worker.clear_history()
            for _ in range(n_test_rollouts):
                eval_rollout_worker.generate_rollouts()

            current_score = mpi_average(eval_rollout_worker.current_score())

            if current_score >= save_at_score and rank == 0:
                save_path = os.path.join(logger.get_dir(), 'saved_model', 'model.ckpt')
                logger.info("Saving models to {}".format(save_path))
                saver.save(sess, save_path)

            if save_checkpoints_at is not None:
                for score in save_checkpoints_at.copy():
                    if current_score >= score and rank == 0:
                        logger.info("Reached checkpoint for {}".format(score))
                        save_path = os.path.join(
                            logger.get_dir(), 'saved_model',
                            'model_score_{}.ckpt'.format(str(score).replace(".", "p")))
                        logger.info("Saving models to {}".format(save_path))
                        saver.save(sess, save_path)
                        save_checkpoints_at.remove(score)

            if stop_at_score is not None and current_score >= stop_at_score:
                logger.info("Stopping score of {} reached. Quitting...".format(stop_at_score))
                should_quit_early = True

        # record logs
        logger.record_tabular('epoch', epoch)
        timesteps = (MPI.COMM_WORLD.Get_size() * epoch * n_cycles * rollout_batches_per_cycle *
                     train_rollout_worker.rollout_batch_size * train_rollout_worker.T)
        logger.record_tabular('timesteps', timesteps)
        if do_evaluation:
            for key, val in eval_rollout_worker.logs('test'):
                logger.record_tabular(key, mpi_average(val))
        for key, val in train_rollout_worker.logs('train'):
            logger.record_tabular(key, mpi_average(val))
        for role, agent in agents.items():
            for key, val in agent.get_stats().items():
                logger.record_tabular("{}_agent_{}".format(role, key), mpi_average(val))

        if rank == 0:
            logger.dump_tabular()

        # make sure that different threads have different seeds
        local_uniform = np.random.uniform(size=(1,))
        root_uniform = local_uniform.copy()
        MPI.COMM_WORLD.Bcast(root_uniform, root=0)
        if rank != 0:
            assert local_uniform[0] != root_uniform[0]

        epoch_end_time = datetime.now()
        if rank == 0:
            logger.info("(epoch took {} seconds)".format((epoch_end_time - epoch_start_time).total_seconds()))
            logger.info("(completed at {})".format(epoch_end_time))

        if should_quit_early:
            break

    if rank == 0:
        save_path = os.path.join(logger.get_dir(), 'saved_model', 'model.ckpt')
        logger.info("Saving models to {}".format(save_path))
        saver.save(sess, save_path)
def __init__(self, obs0, action, obs1, clip_norm, hidden, layers):
    logger.info("Using Random Network Distillation")

    rep_size = hidden

    # RND bonus.

    # Random target network (commented-out CNN-based reference implementation).
    # for ph in self.ph_ob.values():
    #     if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
    #         logger.info("CnnTarget: using '%s' shape %s as image input" % (ph.name, str(ph.shape)))
    #         xr = ph[:, 1:]
    #         xr = tf.cast(xr, tf.float32)
    #         xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:]
    #         xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0, 5.0)
    #
    #         xr = tf.nn.leaky_relu(conv(xr, 'c1r', nf=convfeat * 1, rf=8, stride=4, init_scale=np.sqrt(2)))
    #         xr = tf.nn.leaky_relu(conv(xr, 'c2r', nf=convfeat * 2 * 1, rf=4, stride=2, init_scale=np.sqrt(2)))
    #         xr = tf.nn.leaky_relu(conv(xr, 'c3r', nf=convfeat * 2 * 1, rf=3, stride=1, init_scale=np.sqrt(2)))
    #         rgbr = [to2d(xr)]
    #         X_r = fc(rgbr[0], 'fc1r', nh=rep_size, init_scale=np.sqrt(2))

    with tf.variable_scope('random_network_distillation'):
        self.rnd_scope = tf.get_variable_scope().name

        # Random target network.
        with tf.variable_scope('target_network'):
            xr = nn(obs1, [hidden] * layers + [rep_size])

            # xr = tf.nn.leaky_relu(fc(obs1, "fc1r", nh=hidden*2, init_scale=np.sqrt(2)))
            # xr = tf.nn.leaky_relu(fc(xr, "fc2r", nh=hidden*2, init_scale=np.sqrt(2)))
            # xr = tf.nn.leaky_relu(fc(xr, "fc3r", nh=hidden, init_scale=np.sqrt(2)))
            # xr = tf.nn.relu(fc(xr, "fc4r", nh=hidden, init_scale=np.sqrt(2)))
            # xr = tf.nn.relu(fc(xr, "fc5r", nh=hidden, init_scale=np.sqrt(2)))
            # xr = fc(xr, "fc6r", nh=rep_size, init_scale=np.sqrt(2))

        # Predictor network.
        with tf.variable_scope('predictor_network'):
            self.predictor_scope = tf.get_variable_scope().name

            # xr_hat = tf.nn.leaky_relu(fc(obs1, "fcr_hat1", nh=hidden*2, init_scale=np.sqrt(2)))
            # xr_hat = tf.nn.leaky_relu(fc(xr_hat, "fcr_hat2", nh=hidden*2, init_scale=np.sqrt(2)))
            # xr_hat = tf.nn.leaky_relu(fc(xr_hat, "fcr_hat3", nh=hidden, init_scale=np.sqrt(2)))
            # xr_hat = tf.nn.relu(fc(xr_hat, "fcr_hat4", nh=hidden, init_scale=np.sqrt(2)))
            # xr_hat = tf.nn.relu(fc(xr_hat, "fcr_hat5", nh=hidden, init_scale=np.sqrt(2)))
            # xr_hat = tf.nn.relu(fc(xr_hat, "fcr_hat6", nh=hidden, init_scale=np.sqrt(2)))
            # xr_hat = tf.nn.relu(fc(xr_hat, "fcr_hat7", nh=hidden, init_scale=np.sqrt(2)))
            # xr_hat = fc(xr_hat, "fcr_hat8", nh=rep_size, init_scale=np.sqrt(2))

            xr_hat = nn(obs1, [hidden] * layers + [rep_size])

    # Predictor network (commented-out CNN-based reference implementation).
    # for ph in self.ph_ob.values():
    #     if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
    #         logger.info("CnnTarget: using '%s' shape %s as image input" % (ph.name, str(ph.shape)))
    #         xrp = ph[:, 1:]
    #         xrp = tf.cast(xrp, tf.float32)
    #         xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:]
    #         xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std, -5.0, 5.0)
    #
    #         xrp = tf.nn.leaky_relu(conv(xrp, 'c1rp_pred', nf=convfeat, rf=8, stride=4, init_scale=np.sqrt(2)))
    #         xrp = tf.nn.leaky_relu(conv(xrp, 'c2rp_pred', nf=convfeat * 2, rf=4, stride=2, init_scale=np.sqrt(2)))
    #         xrp = tf.nn.leaky_relu(conv(xrp, 'c3rp_pred', nf=convfeat * 2, rf=3, stride=1, init_scale=np.sqrt(2)))
    #         rgbrp = to2d(xrp)
    #         # X_r_hat = tf.nn.relu(fc(rgb[0], 'fc1r_hat1', nh=256 * enlargement, init_scale=np.sqrt(2)))
    #         X_r_hat = tf.nn.relu(fc(rgbrp, 'fc1r_hat1_pred', nh=256 * enlargement, init_scale=np.sqrt(2)))
    #         X_r_hat = tf.nn.relu(fc(X_r_hat, 'fc1r_hat2_pred', nh=256 * enlargement, init_scale=np.sqrt(2)))
    #         X_r_hat = fc(X_r_hat, 'fc1r_hat3_pred', nh=rep_size, init_scale=np.sqrt(2))

    # self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1])
    # self.max_feat = tf.reduce_max(tf.abs(X_r))
    # self.int_rew = tf.reduce_mean(tf.square(tf.stop_gradient(xr) - xr_hat), axis=-1, keep_dims=True)
    #
    # targets = tf.stop_gradient(X_r)
    # # self.aux_loss = tf.reduce_mean(tf.square(noisy_targets - X_r_hat))
    # self.aux_loss = tf.reduce_mean(tf.square(targets - X_r_hat), -1)
    # mask = tf.random_uniform(shape=tf.shape(self.aux_loss), minval=0., maxval=1., dtype=tf.float32)
    # mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update, tf.float32)
    # self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(tf.reduce_sum(mask), 1.)

    total_parameters = 0
    for variable in _vars(self.predictor_scope):
        # shape is an array of tf.Dimension
        shape = variable.get_shape()
        variable_parameters = 1
        for dim in shape:
            variable_parameters *= dim.value
        total_parameters += variable_parameters
    logger.info("params in rnd predictor network: {}".format(total_parameters))

    self.feat_var = tf.reduce_mean(tf.nn.moments(xr, axes=[0])[1])
    self.max_feat = tf.reduce_max(tf.abs(xr))

    # loss functions
    self.per_sample_loss_tf = tf.reduce_mean(tf.square(tf.stop_gradient(xr) - xr_hat), axis=-1, keepdims=True)
    self.mean_loss_tf = tf.reduce_mean(self.per_sample_loss_tf)
    self.dynamics_grads = U.flatgrad(self.mean_loss_tf, _vars(self.predictor_scope), clip_norm=clip_norm)

    # optimizers
    self.dynamics_adam = MpiAdam(_vars(self.predictor_scope), scale_grad_by_procs=False)
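# Sketch: one predictor update step for the RND module above, and reading out the intrinsic
# reward. The names `rnd`, `obs1_ph`, `o_next_batch`, and `sess` are illustrative assumptions;
# MpiAdam.update takes a flat gradient vector and a step size.
loss, grads = sess.run([rnd.mean_loss_tf, rnd.dynamics_grads],
                       feed_dict={obs1_ph: o_next_batch})
rnd.dynamics_adam.update(grads, stepsize=1e-3)

# The per-sample loss doubles as the exploration bonus: next states the predictor matches poorly
# against the fixed random target network (i.e. rarely visited states) receive larger rewards.
intrinsic_reward = sess.run(rnd.per_sample_loss_tf, feed_dict={obs1_ph: o_next_batch})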