def __init__(self, args, env, env_params):
    self.args = args
    # path to save the model
    self.exp_name = '_'.join((self.args.env_name, self.args.alg, str(self.args.seed), datetime.now().isoformat()))
    self.data_path = os.path.join(self.args.save_dir, '_'.join((self.args.env_name, self.args.alg)), self.exp_name)
    self.logger = EpochLogger(output_dir=self.data_path, exp_name=self.exp_name)
    self.logger.save_config(args)
    self.env = env
    self.env_params = env_params
    # create the network
    self.actor_network = actor(env_params)
    self.critic_network = critic(env_params)
    # sync the networks across the cpus
    sync_networks(self.actor_network)
    sync_networks(self.critic_network)
    # build up the target network
    self.actor_target_network = actor(env_params)
    self.critic_target_network = critic(env_params)
    # load the weights into the target networks
    self.actor_target_network.load_state_dict(self.actor_network.state_dict())
    self.critic_target_network.load_state_dict(self.critic_network.state_dict())
    # if use gpu: spread the MPI ranks across the available devices
    self.rank = MPI.COMM_WORLD.Get_rank()
    if args.cuda:
        device = 'cuda:{}'.format(self.rank % torch.cuda.device_count())
    else:
        device = 'cpu'
    self.device = torch.device(device)
    if self.args.cuda:
        self.actor_network.cuda(self.device)
        self.critic_network.cuda(self.device)
        self.actor_target_network.cuda(self.device)
        self.critic_target_network.cuda(self.device)
    # create the optimizer
    self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
    self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
    # her sampler
    self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env.compute_reward)
    # create the replay buffer
    self.buffer = replay_buffer(self.env_params, self.args.buffer_size, self.her_module.sample_her_transitions)
    # create the normalizer
    self.o_norm = normalizer(size=env_params['obs'], default_clip_range=self.args.clip_range)
    self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)
    self.logger.setup_pytorch_saver(self.actor_network)
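# NOTE: sync_networks is called in every variant but never shown. A minimal
# sketch of what it plausibly does (broadcast rank 0's parameters so all MPI
# workers start with identical weights), assuming mpi4py and flat numpy
# serialization; the helper these repos actually use may differ:
import numpy as np
import torch
from mpi4py import MPI

def sync_networks_sketch(network):
    comm = MPI.COMM_WORLD
    # flatten all parameters into one contiguous numpy vector
    flat = np.concatenate([p.data.cpu().numpy().ravel() for p in network.parameters()])
    # broadcast rank 0's weights to every worker, in place
    comm.Bcast(flat, root=0)
    # copy the broadcast values back into the network parameters
    idx = 0
    for p in network.parameters():
        n = p.numel()
        p.data.copy_(torch.as_tensor(flat[idx:idx + n]).view_as(p.data))
        idx += n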
def __init__(self, args, env, env_params):
    self.args = args
    self.env = env
    self.env_params = env_params
    # create the network
    self.actor_network = actor(env_params)
    self.critic_network = critic(env_params)
    # sync the networks across the cpus
    sync_networks(self.actor_network)
    sync_networks(self.critic_network)
    # build up the target network
    self.actor_target_network = actor(env_params)
    self.critic_target_network = critic(env_params)
    # load the weights into the target networks
    self.actor_target_network.load_state_dict(self.actor_network.state_dict())
    self.critic_target_network.load_state_dict(self.critic_network.state_dict())
    # if use gpu
    if self.args.cuda:
        self.actor_network.cuda()
        self.critic_network.cuda()
        self.actor_target_network.cuda()
        self.critic_target_network.cuda()
    # create the optimizer ('SGD' and 'adam' are the supported choices)
    if self.args.optimizer_type == 'SGD':
        self.actor_optim = torch.optim.SGD(self.actor_network.parameters(), lr=self.args.lr_actor)
        self.critic_optim = torch.optim.SGD(self.critic_network.parameters(), lr=self.args.lr_critic)
    elif self.args.optimizer_type == 'adam':
        self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
        self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
    else:
        # fail fast: otherwise the optimizers would be left undefined
        raise ValueError('unknown optimizer_type: {}'.format(self.args.optimizer_type))
    # her sampler
    self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env.compute_reward)
    # create the replay buffer
    self.buffer = replay_buffer(self.env_params, self.args.buffer_size, self.her_module.sample_her_transitions)
    # create the normalizer
    self.o_norm = normalizer(size=env_params['obs'], default_clip_range=self.args.clip_range)
    self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)
    self.scales = []
    # create the directory to store the model
    if MPI.COMM_WORLD.Get_rank() == 0:
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        # path to save the model
        self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
    self.result_dir = f'./learning_curves/{args.env_name}/{self.args.run_name}'
    if not os.path.isdir(self.result_dir):
        os.makedirs(self.result_dir, exist_ok=True)
        print(f'creating {self.result_dir}')
    self.writer = SummaryWriter(logdir=self.result_dir)
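# NOTE: a minimal sketch of how the tensorboardX writer created above is
# typically driven during training; the tag names and the success_rate /
# critic_loss variables are hypothetical, not taken from this code:
# self.writer.add_scalar('test/success_rate', success_rate, epoch)
# self.writer.add_scalar('train/critic_loss', critic_loss, epoch)
# self.writer.close()  # once training finishes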
def __init__(self, args, env, env_params):
    self.args = args
    self.env = env
    self.env_params = env_params
    # create the network
    self.actor_network = actor(env_params)
    self.critic_network = critic(env_params)
    # sync the networks across the cpus
    sync_networks(self.actor_network)
    sync_networks(self.critic_network)
    # build up the target network
    self.actor_target_network = actor(env_params)
    self.critic_target_network = critic(env_params)
    # load the weights into the target networks
    self.actor_target_network.load_state_dict(self.actor_network.state_dict())
    self.critic_target_network.load_state_dict(self.critic_network.state_dict())
    # if use gpu
    if self.args.cuda:
        self.actor_network.cuda()
        self.critic_network.cuda()
        self.actor_target_network.cuda()
        self.critic_target_network.cuda()
    # create the optimizer
    self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
    self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
    # her sampler
    self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env.compute_reward)
    # create the replay buffer
    self.buffer = replay_buffer(self.env_params, self.args.buffer_size, self.her_module.sample_her_transitions)
    # create the normalizer
    self.o_norm = normalizer(size=env_params['obs'], default_clip_range=self.args.clip_range)
    self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)
    # create the directory to store the model
    if MPI.COMM_WORLD.Get_rank() == 0:
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        # path to save the model
        self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
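# NOTE: every variant initializes the target networks with a hard copy and
# (elsewhere in these agents, not shown here) tracks them with a polyak soft
# update. A minimal sketch of that standard DDPG update; the coefficient name
# and value (commonly around 0.95 in HER code) are assumptions:
def soft_update_sketch(target_network, source_network, polyak=0.95):
    # target <- polyak * target + (1 - polyak) * source, parameter by parameter
    for target_param, param in zip(target_network.parameters(), source_network.parameters()):
        target_param.data.copy_(polyak * target_param.data + (1 - polyak) * param.data)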
def __init__(self, args, env, env_params):
    self.args = args
    self.env = env
    self.env_params = env_params
    # create the network (this variant passes args through to the models)
    self.actor_network = actor(args, env_params)
    self.critic_network = critic(args, env_params)
    # sync the networks across the cpus
    sync_networks(self.actor_network)
    sync_networks(self.critic_network)
    # build up the target network
    self.actor_target_network = actor(args, env_params)
    self.critic_target_network = critic(args, env_params)
    # load the weights into the target networks
    self.actor_target_network.load_state_dict(self.actor_network.state_dict())
    self.critic_target_network.load_state_dict(self.critic_network.state_dict())
    # if use gpu
    if self.args.cuda:
        self.actor_network.cuda()
        self.critic_network.cuda()
        self.actor_target_network.cuda()
        self.critic_target_network.cuda()
    # create the optimizer
    self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
    self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
    # her sampler
    self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env.compute_reward)
    # create the replay buffer
    self.buffer = replay_buffer(self.env_params, self.args.buffer_size, self.her_module.sample_her_transitions)
    # create the normalizer
    self.o_norm = normalizer(size=env_params['obs'], default_clip_range=self.args.clip_range)
    self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)
    # create the directory to store the model
    if MPI.COMM_WORLD.Get_rank() == 0:
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        # path to save the model
        self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
    # added: set up text and tabular logging
    log_dir = create_env_folder(args.env_name, args.network_class, test=args.test)
    save_kwargs(vars(args), log_dir)
    tabular_log_path = osp.join(log_dir, 'progress.csv')
    text_log_path = osp.join(log_dir, 'debug.log')
    logger.add_text_output(text_log_path)
    logger.add_tabular_output(tabular_log_path)
    exp_name = f'{args.env_name}'
    logger.push_prefix("[%s] " % exp_name)
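# NOTE: a minimal sketch of how the rllab/rlkit-style logger configured above
# is typically flushed once per epoch; the keys and variables are hypothetical:
# logger.record_tabular('epoch', epoch)
# logger.record_tabular('test/success_rate', success_rate)
# logger.dump_tabular()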
def demo_2_envs(env, args, env_id):
    # load the model param
    model_path = args.save_dir + args.env1_name + args.env2_name + '/' + args.save_name
    o_mean, o_std, g_mean, g_std, model = torch.load(model_path, map_location=lambda storage, loc: storage)
    # get the env param
    observation = env.reset()
    # get the environment params
    env_params = {
        'obs': ddpg_agent.inject_obs(observation['observation'], env_id, args).shape[0],
        'goal': observation['desired_goal'].shape[0],
        'action': env.action_space.shape[0],
        'action_max': env.action_space.high[0],
    }
    try:
        # create the actor network
        actor_network = actor(env_params)
        actor_network.load_state_dict(model)
        actor_network.eval()
    except Exception:
        # the checkpoint may have been trained with the opposite
        # observation-injection setting; flip the flag and retry
        args.dont_inject_observation = not args.dont_inject_observation
        # get the environment params
        env_params['obs'] = ddpg_agent.inject_obs(observation['observation'], env_id, args).shape[0]
        # create the actor network
        actor_network = actor(env_params)
        actor_network.load_state_dict(model)
        actor_network.eval()
    for i in range(args.demo_length):
        observation = env.reset()
        # start to do the demo
        obs = ddpg_agent.inject_obs(observation['observation'], env_id, args)
        g = observation['desired_goal']
        for t in range(env._max_episode_steps):
            env.render()
            inputs = process_inputs(obs, g, o_mean, o_std, g_mean, g_std, args)
            with torch.no_grad():
                pi = actor_network(inputs)
            action = pi.detach().numpy().squeeze()
            # put actions into the environment
            observation_new, reward, _, info = env.step(action)
            obs = ddpg_agent.inject_obs(observation_new['observation'], env_id, args)
        print('the episode is: {}, is success: {}'.format(i, info['is_success']))
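# NOTE: process_inputs is called above but not shown. A minimal sketch of the
# usual HER demo preprocessing (clip the raw values, z-normalize with the saved
# statistics, concatenate obs and goal into one batch-of-one tensor); it
# assumes args.clip_obs and args.clip_range exist as in the training code:
import numpy as np
import torch

def process_inputs_sketch(o, g, o_mean, o_std, g_mean, g_std, args):
    # clip raw observation and goal, then normalize with the stored mean/std
    o_clip = np.clip(o, -args.clip_obs, args.clip_obs)
    g_clip = np.clip(g, -args.clip_obs, args.clip_obs)
    o_norm = np.clip((o_clip - o_mean) / o_std, -args.clip_range, args.clip_range)
    g_norm = np.clip((g_clip - g_mean) / g_std, -args.clip_range, args.clip_range)
    # the actor consumes the concatenated (obs, goal) vector with a batch dim
    inputs = np.concatenate([o_norm, g_norm])
    return torch.tensor(inputs, dtype=torch.float32).unsqueeze(0)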
model_path = args.save_dir + args.env_name + '/model.pt'
o_mean, o_std, g_mean, g_std, model = torch.load(model_path, map_location=lambda storage, loc: storage)
# create the environment
env = gym.make(args.env_name)
# get the env param
observation = env.reset()
# get the environment params
env_params = {
    'obs': observation['observation'].shape[0],
    'goal': observation['desired_goal'].shape[0],
    'action': env.action_space.shape[0],
    'action_max': env.action_space.high[0],
}
# create the actor network
actor_network = actor(env_params)
actor_network.load_state_dict(model)
actor_network.eval()
safe_path('./images')
for i in range(args.demo_length):
    observation = env.reset()
    # start to do the demo
    obs = observation['observation']
    g = observation['desired_goal']
    epi_path = safe_path('./images/epi{}'.format(i))
    for t in range(env._max_episode_steps):
        # env.render()
        path = os.path.join(epi_path, 'img_{}.jpg'.format(t))
        img = env.sim.render(mode='offscreen', camera_name='external_camera_0', width=256,
def __init__(self, args, env, env_params):
    self.args = args
    self.env = env
    self.env_params = env_params
    sim = self.env.sim
    self.viewer = MjRenderContextOffscreen(sim)
    # self.viewer.cam.fixedcamid = 3
    # self.viewer.cam.type = const.CAMERA_FIXED
    self.critic_loss = []
    self.actor_loss = []
    self.viewer.cam.distance = 1.2
    self.viewer.cam.azimuth = 180
    self.viewer.cam.elevation = -25
    env.env._viewers['rgb_array'] = self.viewer
    self.image_based = bool(args.image)
    print("Training image-based RL? : {}".format(self.image_based))
    # create the network (the image-based variant uses new_actor)
    if not self.image_based:
        self.actor_network = actor(env_params)
    else:
        self.actor_network = new_actor(env_params)
        # self.actor_network = resnet_actor(env_params)
    self.critic_network = critic(env_params)
    # sync the networks across the cpus
    sync_networks(self.actor_network)
    sync_networks(self.critic_network)
    # build up the target network
    if not self.image_based:
        self.actor_target_network = actor(env_params)
    else:
        # self.actor_target_network = resnet_actor(env_params)
        self.actor_target_network = new_actor(env_params)
    self.critic_target_network = critic(env_params)
    # load the weights into the target networks
    self.actor_target_network.load_state_dict(self.actor_network.state_dict())
    self.critic_target_network.load_state_dict(self.critic_network.state_dict())
    # if use gpu: each MPI rank claims the CUDA device matching its rank
    # (this assumes at least one GPU per worker)
    if self.args.cuda:
        print("use the GPU")
        self.actor_network.cuda(MPI.COMM_WORLD.Get_rank())
        self.critic_network.cuda(MPI.COMM_WORLD.Get_rank())
        self.actor_target_network.cuda(MPI.COMM_WORLD.Get_rank())
        self.critic_target_network.cuda(MPI.COMM_WORLD.Get_rank())
    # create the optimizer
    self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
    self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
    # her sampler
    self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env.compute_reward, self.image_based)
    # create the replay buffer
    self.buffer = replay_buffer(self.env_params, self.args.buffer_size, self.her_module.sample_her_transitions, self.image_based)
    # create the normalizer
    self.o_norm = normalizer(size=env_params['obs'], default_clip_range=self.args.clip_range)
    self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)
    # create the directory to store the model
    if MPI.COMM_WORLD.Get_rank() == 0:
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        # path to save the model
        self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
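# NOTE: a minimal sketch of how an image observation could be grabbed from the
# offscreen viewer configured above, using mujoco_py's MjRenderContextOffscreen
# API; the actual image pipeline of this variant is not shown, and the
# width/height values are assumptions:
def render_image_obs_sketch(viewer, width=100, height=100):
    viewer.render(width, height)
    # read_pixels returns the frame upside down, so flip it vertically
    img = viewer.read_pixels(width, height, depth=False)
    return img[::-1, :, :]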
def __init__(self, args, env1, env2, env1_params, env2_params):
    self.args = args
    self.env1 = env1
    self.env2 = env2
    self.env1_params = env1_params
    self.env2_params = env2_params
    if not self.args.dont_inject_observation:
        self.env2_params['obs'] += 1
        self.env1_params['obs'] += 1
    self.train_mode = TrainMode(args.training_mode)
    # store the Weights & Biases API key if given in args
    if self.args.wandb_api_key is not None:
        os.environ["WANDB_API_KEY"] = self.args.wandb_api_key
    # if a key is present, set a flag to enable the functionality
    self.use_wandb_log = os.environ.get("WANDB_API_KEY") is not None
    # create the network
    assert env1_params == env2_params  # TODO: make sure to check for equality
    self.actor_network = actor(env1_params)
    self.critic_network = critic(env1_params)
    # sync the networks across the cpus
    sync_networks(self.actor_network)
    sync_networks(self.critic_network)
    # build up the target network
    self.actor_target_network = actor(env1_params)
    self.critic_target_network = critic(env1_params)
    # load the weights into the target networks
    self.actor_target_network.load_state_dict(self.actor_network.state_dict())
    self.critic_target_network.load_state_dict(self.critic_network.state_dict())
    # if use gpu
    if self.args.cuda:
        self.actor_network.cuda()
        self.critic_network.cuda()
        self.actor_target_network.cuda()
        self.critic_target_network.cuda()
    # create the optimizer
    self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
    self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
    # set up the dual critic if applicable
    self.use_two_critics = self.args.dual_critic
    if self.use_two_critics:
        self.critic_network2 = critic(env1_params)
        sync_networks(self.critic_network2)
        self.critic_target_network2 = critic(env1_params)
        self.critic_target_network2.load_state_dict(self.critic_network2.state_dict())
        self.critic2_optim = torch.optim.Adam(self.critic_network2.parameters(), lr=self.args.lr_critic)
        if self.args.cuda:
            self.critic_network2.cuda()
            self.critic_target_network2.cuda()
    # her samplers, one per environment
    self.her_module1 = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env1.compute_reward)
    self.her_module2 = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env2.compute_reward)
    # create the replay buffers
    self.buffer1 = replay_buffer(self.env1_params, self.args.buffer_size, self.her_module1.sample_her_transitions)
    self.buffer2 = replay_buffer(self.env2_params, self.args.buffer_size, self.her_module2.sample_her_transitions)
    # create the normalizer
    self.o_norm = normalizer(size=env1_params['obs'], default_clip_range=self.args.clip_range)
    self.g_norm = normalizer(size=env1_params['goal'], default_clip_range=self.args.clip_range)
    # create the directory for storing the model
    if MPI.COMM_WORLD.Get_rank() == 0:
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        # path to save the model
        self.model_path = os.path.join(self.args.save_dir, self.args.env1_name + self.args.env2_name)
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
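# NOTE: the dual-critic variant above creates critic_network2 but its update is
# not shown. A minimal sketch of the clipped double-Q target this setup enables
# (TD3-style minimum over the two target critics); the call signatures follow
# the usual HER convention (actor(inputs), critic(inputs, actions)) and the
# tensor names are hypothetical:
import torch

def clipped_double_q_target_sketch(actor_target, critic_target1, critic_target2,
                                   inputs_next_norm, r_tensor, gamma):
    with torch.no_grad():
        actions_next = actor_target(inputs_next_norm)
        q1 = critic_target1(inputs_next_norm, actions_next)
        q2 = critic_target2(inputs_next_norm, actions_next)
        # taking the min over both critics curbs Q-value overestimation
        return r_tensor + gamma * torch.min(q1, q2)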
def __init__(self, args, env, env_params):
    self.args = args
    self.env = env
    self.env_params = env_params
    # check whether to continue training or start fresh
    if args.continue_training is None:
        self.continueTraining = False
    else:
        self.continueTraining = args.continue_training
    # create the network
    self.actor_network = actor(env_params)
    self.critic_network = critic(env_params)
    # sync the networks across the cpus
    sync_networks(self.actor_network)
    sync_networks(self.critic_network)
    # build up the target network
    self.actor_target_network = actor(env_params)
    self.critic_target_network = critic(env_params)
    # print a summary of the run configuration on rank 0
    dash = "-" * 42
    if MPI.COMM_WORLD.Get_rank() == 0:
        print("env.spec.id: ", env.spec.id)
        print("args: ")
        d_args = vars(args)
        print(dash)
        print("{:<25s}{:<15s}".format("ARGS", "VALUE"))
        for key in d_args:
            if d_args[key] is not None:
                print("|{:<22s} | {:<15}|".format(key, d_args[key]))
        print(dash)
        print("env_inits: ")
        print("{:<25s}{:<15s}".format("ENV_INIT", "VALUE"))
        for key in env.env.inits:
            print("|{:<22s} | {:<15}|".format(key, env.env.inits[key]))
        print(dash)
        print("env_dimensions: ")
        for key in env_params:
            print("|{:<22s} | {:<15}|".format(key, env_params[key]))
        print(dash)
        # print("env_params", env_params)
    if self.continueTraining:
        if MPI.COMM_WORLD.Get_rank() == 0:
            print("CONTINUE TRAINING...")
        env_name = env.spec.id
        saved_dicts = load_saved_state_dicts(args.save_dir, env_name, MPI.COMM_WORLD.Get_rank())
        self.actor_network.load_state_dict(saved_dicts['actor'])
        self.critic_network.load_state_dict(saved_dicts['critic'])
        self.critic_target_network.load_state_dict(saved_dicts['critic_target'])
        self.actor_target_network.load_state_dict(saved_dicts['actor_target'])
    else:
        # load the weights into the target networks
        self.actor_target_network.load_state_dict(self.actor_network.state_dict())
        self.critic_target_network.load_state_dict(self.critic_network.state_dict())
    # if use gpu
    if self.args.cuda:
        self.actor_network.cuda()
        self.critic_network.cuda()
        self.actor_target_network.cuda()
        self.critic_target_network.cuda()
    # create the optimizer
    self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
    self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
    # her sampler
    self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env.compute_reward)
    # create the replay buffer
    self.buffer = replay_buffer(self.env_params, self.args.buffer_size, self.her_module.sample_her_transitions)
    # create the normalizer
    self.o_norm = normalizer(size=env_params['obs'], default_clip_range=self.args.clip_range)
    self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)
    # create the directory to store the model
    if MPI.COMM_WORLD.Get_rank() == 0:
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        # path to save the model
        self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
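# NOTE: load_saved_state_dicts is referenced above but not shown. A minimal
# sketch under the assumption that the checkpoint is a torch-saved dict with
# 'actor', 'critic', 'actor_target', and 'critic_target' entries; the file
# name and layout are hypothetical and the real helper may differ:
import os
import torch

def load_saved_state_dicts_sketch(save_dir, env_name, rank):
    ckpt_path = os.path.join(save_dir, env_name, 'checkpoint.pt')  # hypothetical file name
    if rank == 0:
        print('loading checkpoint from', ckpt_path)
    # map to CPU so every MPI rank can load regardless of its GPU layout
    return torch.load(ckpt_path, map_location='cpu')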