def __init__(self, args, env, env_params):
    self.args = args
    self.env = env
    self.env_params = env_params
    # create the networks
    self.actor_network = actor(env_params)
    self.critic_network = critic(env_params)
    # sync the networks across the CPUs
    sync_networks(self.actor_network)
    sync_networks(self.critic_network)
    # build up the target networks
    self.actor_target_network = actor(env_params)
    self.critic_target_network = critic(env_params)
    # load the weights into the target networks
    self.actor_target_network.load_state_dict(self.actor_network.state_dict())
    self.critic_target_network.load_state_dict(self.critic_network.state_dict())
    # move the networks to the GPU if requested and available
    if self.args.cuda and torch.cuda.is_available():
        self.actor_network.cuda()
        self.critic_network.cuda()
        self.actor_target_network.cuda()
        self.critic_target_network.cuda()
    # create the optimizers
    self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
    self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
    # HER sampler
    self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env.compute_reward)
    # create the replay buffer
    self.buffer = replay_buffer(self.env_params, self.args.buffer_size, self.her_module.sample_her_transitions)
    # create the normalizers for observations and goals
    self.o_norm = normalizer(size=env_params['obs'], default_clip_range=self.args.clip_range)
    self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)
    # create the directory to store the model (only on the root MPI process)
    if MPI.COMM_WORLD.Get_rank() == 0:
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        # path to save the model
        self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
        print('model path', self.model_path)
    # tensorboardX writer (disabled)
    # self.writer = SummaryWriter('./logs')
    self.reward_list = []
    self.reward_record = []
    self.success_rate_list = []
    self.success_list = []
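# The sync_networks helper used above is not shown in this section. Below is a
# minimal sketch of what such a helper typically does: broadcast the rank-0 MPI
# process's parameters so every worker starts from identical weights. The
# function name and body are assumptions for illustration, not the repository's
# actual implementation.
import numpy as np
import torch
from mpi4py import MPI

def sync_networks_sketch(network):
    comm = MPI.COMM_WORLD
    # flatten all parameters into a single contiguous array
    flat = np.concatenate([p.data.cpu().numpy().ravel() for p in network.parameters()])
    # broadcast rank 0's weights to every process (in place)
    comm.Bcast(flat, root=0)
    # copy the broadcast values back into the network parameters
    offset = 0
    for p in network.parameters():
        n = p.numel()
        p.data.copy_(torch.from_numpy(flat[offset:offset + n]).view_as(p.data))
        offset += n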
def __init__(self, args, env, env_params, image=True):
    self.args = args
    self.env = env
    self.env_params = env_params
    self.image = image
    # create the networks (image-based or state-based)
    if self.image:
        self.actor_network = actor_image(env_params, env_params['obs'])
        self.critic_network = critic_image(env_params, env_params['obs'] + env_params['action'])
    else:
        self.actor_network = actor(env_params, env_params['obs'])
        self.critic_network = critic(env_params, env_params['obs'] + env_params['action'])
    # load pretrained weights if a load directory is given
    if self.args.load_dir != '':
        actor_load_path = self.args.load_dir + '/actor.pt'
        model = torch.load(actor_load_path)
        self.actor_network.load_state_dict(model)
        critic_load_path = self.args.load_dir + '/critic.pt'
        model = torch.load(critic_load_path)
        self.critic_network.load_state_dict(model)
    # sync the networks across the CPUs
    # sync_networks(self.actor_network)
    # sync_networks(self.critic_network)
    # build up the target network
    # if self.image:
    #     self.actor_target_network = actor_image(env_params, env_params['obs'])
    # else:
    #     self.actor_target_network = actor(env_params, env_params['obs'])
    # # load the weights into the target networks
    # self.actor_target_network.load_state_dict(self.actor_network.state_dict())
    # move the networks to the GPU if requested
    if self.args.cuda:
        self.actor_network.cuda()
        # self.actor_target_network.cuda()
        self.critic_network.cuda()
    # create the optimizers
    self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
    self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
    # HER sampler (note: self.env is called here, so env is expected to be a
    # callable that returns the environment)
    self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env().compute_reward)
    # create the replay buffer
    self.buffer = replay_buffer(self.env_params, self.args.buffer_size,
                                self.her_module.sample_her_transitions, image=self.image)
    # path to save the model
    self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
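# The load_dir branch above expects actor.pt and critic.pt to contain bare
# state_dicts. A hedged sketch of the matching save calls is given below; the
# function name and call site are assumptions for illustration only.
import torch

def save_networks_sketch(agent, save_dir):
    # write each network's state_dict to the files the loader above expects
    torch.save(agent.actor_network.state_dict(), save_dir + '/actor.pt')
    torch.save(agent.critic_network.state_dict(), save_dir + '/critic.pt')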
def __init__(self, args, env, env_params):
    self.args = args
    self.env = env
    self.env_params = env_params
    # create the networks
    self.actor_network = actor(env_params)
    self.critic_network = critic(env_params)
    # sync the networks across the CPUs
    # sync_networks(self.actor_network)
    # sync_networks(self.critic_network)
    # build up the target networks
    self.actor_target_network = actor(env_params)
    self.critic_target_network = critic(env_params)
    # load a saved model if required
    if args.load_path is not None:
        o_mean, o_std, g_mean, g_std, load_actor_model, load_critic_model = torch.load(
            args.load_path, map_location=lambda storage, loc: storage)
        self.actor_network.load_state_dict(load_actor_model)
        self.critic_network.load_state_dict(load_critic_model)
        # (note: the loaded normalizer statistics are unpacked but not used
        # further in this constructor)
    # load the weights into the target networks
    self.actor_target_network.load_state_dict(self.actor_network.state_dict())
    self.critic_target_network.load_state_dict(self.critic_network.state_dict())
    # move the networks to the GPU if requested
    if self.args.cuda:
        self.actor_network.cuda()
        self.critic_network.cuda()
        self.actor_target_network.cuda()
        self.critic_target_network.cuda()
    # create the optimizers
    self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
    self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
    # HER sampler
    self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env.compute_reward)
    # create the replay buffer
    self.buffer = replay_buffer(self.env_params, self.args.buffer_size, self.her_module.sample_her_transitions)
    # create the normalizers for observations and goals
    self.o_norm = normalizer(size=env_params['obs'], default_clip_range=self.args.clip_range)
    self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)
    # create the directory to store the model
    # if MPI.COMM_WORLD.Get_rank() == 0:
    if not os.path.exists(self.args.save_dir):
        os.mkdir(self.args.save_dir)
    # make up a suffix for the model path to indicate which method was used for training
    # self.folder_suffix = '_' + self.args.replay_strategy + '_' + self.args.env_params.reward_type
    # path to save the model
    self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
    if not os.path.exists(self.model_path):
        os.mkdir(self.model_path)
    self.model_path = os.path.join(self.model_path, 'seed_' + str(self.args.seed))
    if not os.path.exists(self.model_path):
        os.mkdir(self.model_path)
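# For the load_path branch above to work, the checkpoint must have been written
# as a matching 6-tuple. A hedged sketch of the corresponding save call follows;
# the function name and the call site are assumptions for illustration.
import torch

def save_checkpoint_sketch(agent, path):
    # mirror of the 6-tuple unpacked by the load branch above
    torch.save([agent.o_norm.mean, agent.o_norm.std,
                agent.g_norm.mean, agent.g_norm.std,
                agent.actor_network.state_dict(),
                agent.critic_network.state_dict()],
               path)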
def __init__(self, args, env, env_params):
    self.savetime = 0
    self.args = args
    self.env = env
    self.env_params = env_params
    # create the networks
    self.actor_network = actor(env_params)
    self.critic_network = critic(env_params)
    # sync the networks across the CPUs
    sync_networks(self.actor_network)
    sync_networks(self.critic_network)
    # build up the target networks
    self.actor_target_network = actor(env_params)
    self.critic_target_network = critic(env_params)
    # load the weights into the target networks
    self.actor_target_network.load_state_dict(self.actor_network.state_dict())
    self.critic_target_network.load_state_dict(self.critic_network.state_dict())
    # move the networks to the GPU if requested
    if self.args.cuda:
        self.actor_network.cuda()
        self.critic_network.cuda()
        self.actor_target_network.cuda()
        self.critic_target_network.cuda()
    # create the optimizers
    self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
    self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
    # HER sampler
    self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env.compute_reward)
    # create the replay buffer
    self.buffer = replay_buffer(self.env_params, self.args.buffer_size, self.her_module.sample_her_transitions)
    # whether to add demonstration data
    if self.args.add_demo:
        self._init_demo_buffer()  # initialize the replay buffer with demonstrations
    # create the normalizers for observations and goals
    self.o_norm = normalizer(size=env_params['obs'], default_clip_range=self.args.clip_range)
    self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)
    # load saved data to continue training (disabled)
    # model_path = "saved_models/bmirobot-v3/125_True12_model.pt"
    # model_path = args.save_dir + args.env_name + '/' + str(args.seed) + '_' + str(args.add_demo) + '_model.pt'
    # o_mean, o_std, g_mean, g_std, model = torch.load(model_path, map_location=lambda storage, loc: storage)
    # self.actor_network.load_state_dict(model)
    # self.o_norm.mean = o_mean
    # self.o_norm.std = o_std
    # self.g_norm.mean = g_mean
    # self.g_norm.std = g_std
    self.success_rates = []  # record the success rate of each epoch
    # create the directory to store the model (only on the root MPI process)
    if MPI.COMM_WORLD.Get_rank() == 0:
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        # path to save the model
        self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
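# _init_demo_buffer is referenced above but not shown in this section. The
# sketch below is one plausible shape for it, assuming demonstrations are
# stored as per-episode arrays; the file name, the array keys, and the
# store_episode / _update_normalizer helpers are all assumptions for
# illustration, not the repository's actual API.
import numpy as np

def _init_demo_buffer_sketch(self, demo_file='demonstrations.npz'):
    data = np.load(demo_file)
    mb_obs, mb_ag, mb_g, mb_actions = (data['obs'], data['ag'],
                                       data['g'], data['actions'])
    # store the demonstration episodes like ordinary rollouts, and update
    # the observation/goal normalizers with them
    self.buffer.store_episode([mb_obs, mb_ag, mb_g, mb_actions])
    self._update_normalizer([mb_obs, mb_ag, mb_g, mb_actions])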
def __init__(self, args, env, env_params):
    self.args = args
    self.env = env
    self.env_params = env_params
    # create the networks
    self.actor_network = actor(env_params)
    self.critic_network = critic(env_params)
    # build up the target networks
    self.actor_target_network = actor(env_params)
    self.critic_target_network = critic(env_params)
    # load a saved model if required
    if args.load_path is not None:
        o_mean, o_std, g_mean, g_std, load_actor_model, load_critic_model = torch.load(
            args.load_path, map_location=lambda storage, loc: storage)
        self.actor_network.load_state_dict(load_actor_model)
        self.critic_network.load_state_dict(load_critic_model)
    # load the weights into the target networks
    self.actor_target_network.load_state_dict(self.actor_network.state_dict())
    self.critic_target_network.load_state_dict(self.critic_network.state_dict())
    # move the networks to the GPU if requested
    if self.args.cuda:
        self.actor_network.cuda()
        self.critic_network.cuda()
        self.actor_target_network.cuda()
        self.critic_target_network.cuda()
    # create the optimizers
    self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
    self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
    # HER sampler
    self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env.compute_reward)
    # create the replay buffer (HER sampling only for the 'future' strategy)
    if self.args.replay_strategy == 'future':
        self.buffer = replay_buffer(self.env_params, self.args.buffer_size,
                                    self.her_module.sample_her_transitions)
    else:
        self.buffer = replay_buffer(self.env_params, self.args.buffer_size,
                                    self.her_module.sample_normal_transitions)
    # create the normalizers for observations and goals
    self.o_norm = normalizer(size=env_params['obs'], default_clip_range=self.args.clip_range)
    self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)
    # create the directory to store the model
    if not os.path.exists(self.args.save_dir):
        os.mkdir(self.args.save_dir)
    # make up a suffix for the model path to indicate which method was used for training;
    # buffer_len_epochs is the buffer capacity expressed in epochs of experience
    buffer_len_epochs = int(self.args.buffer_size /
                            (env_params['max_timesteps'] * self.args.num_rollouts_per_cycle * self.args.n_cycles))
    name_add_on = ''
    if self.args.exploration_strategy == 'pgg':
        if self.args.pgg_strategy == 'final':
            if self.args.replay_strategy == 'future':
                name_add_on = '_final_distance_based_goal_generation_buffer' + str(buffer_len_epochs) + 'epochs'
            else:
                name_add_on = '_final_distance_based_goal_generation_withoutHER_buffer' + str(buffer_len_epochs) + 'epochs'
        else:
            if self.args.replay_strategy == 'future':
                name_add_on = '_distance_based_goal_generation_buffer' + str(buffer_len_epochs) + 'epochs'
            else:
                name_add_on = '_distance_based_goal_generation_withoutHER_buffer' + str(buffer_len_epochs) + 'epochs'
    else:
        if self.args.replay_strategy == 'future':
            name_add_on = '_originalHER_buffer' + str(buffer_len_epochs) + 'epochs'
        else:
            name_add_on = '_originalDDPG_buffer' + str(buffer_len_epochs) + 'epochs'
    # path to save the model
    self.model_path = os.path.join(self.args.save_dir, self.args.env_name + name_add_on)
    if not os.path.exists(self.model_path):
        os.mkdir(self.model_path)
    self.model_path = os.path.join(self.model_path, 'seed_' + str(self.args.seed))
    if not os.path.exists(self.model_path):
        os.mkdir(self.model_path)
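# Worked example for buffer_len_epochs above (illustrative numbers, not this
# project's defaults): with buffer_size = 1_000_000, max_timesteps = 50,
# num_rollouts_per_cycle = 2 and n_cycles = 50, the computation gives
# int(1_000_000 / (50 * 2 * 50)) == 200, so the model-path suffix would end
# in 'buffer200epochs'.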
def __init__(self, args, envs_lst, env_params, expert_lst_dir,
             recurrent=True, ee_reward=True, image=True):
    self.args = args
    self.envs_lst = envs_lst
    self.env_params = env_params
    self.recurrent = recurrent
    self.ee_reward = ee_reward
    self.image = image
    # initialize the expert policies
    self.expert_lst = []
    for expert_dir in expert_lst_dir:
        expert_load_path = expert_dir + '/model.pt'
        o_mean, o_std, g_mean, g_std, model = torch.load(expert_load_path)
        expert_model = actor(env_params, env_params['obs'] + env_params['goal'])
        expert_model.load_state_dict(model)
        self.expert_lst.append({
            "model": expert_model,
            "o_mean": o_mean,
            "o_std": o_std,
            "g_mean": g_mean,
            "g_std": g_std
        })
    # create the networks (recurrent or feed-forward actor)
    if self.recurrent:
        self.actor_network = actor_recurrent(
            env_params,
            env_params['obs'] + env_params['goal'] + env_params['action'],
            env_params['goal'])
        # self.critic_network = critic_recurrent(env_params, env_params['obs'] + env_params['goal'] + 2 * env_params['action'])
    else:
        self.actor_network = actor(
            env_params,
            env_params['obs'] + env_params['goal'] + env_params['action'],
            env_params['goal'])
    self.critic_network = critic(
        env_params, env_params['obs'] + 2 * env_params['goal'] + env_params['action'])
    # create the normalizers for observations, goals and subgoals
    self.o_norm = normalizer(size=env_params['obs'], default_clip_range=self.args.clip_range)
    self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)
    self.sg_norm = normalizer(size=env_params['action'], default_clip_range=self.args.clip_range)
    # load a pretrained model if a load directory is given
    if self.args.load_dir != '':
        load_path = self.args.load_dir + '/model.pt'
        # o_mean, o_std, g_mean, g_std, sg_mean, sg_std, model = torch.load(load_path)
        o_mean, o_std, g_mean, g_std, model = torch.load(load_path)
        self.o_norm.mean = o_mean
        self.o_norm.std = o_std
        self.g_norm.mean = g_mean
        self.g_norm.std = g_std
        # self.sg_norm.mean = sg_mean
        # self.sg_norm.std = sg_std
        self.actor_network.load_state_dict(model)
    # sync the networks across the CPUs
    sync_networks(self.actor_network)
    sync_networks(self.critic_network)
    # build up the target networks
    if self.recurrent:
        self.actor_target_network = actor_recurrent(
            env_params,
            env_params['obs'] + env_params['goal'] + env_params['action'],
            env_params['goal'])
        # self.critic_target_network = critic_recurrent(env_params, env_params['obs'] + env_params['goal'] + 2 * env_params['action'])
    else:
        self.actor_target_network = actor(
            env_params,
            env_params['obs'] + env_params['goal'] + env_params['action'],
            env_params['goal'])
    self.critic_target_network = critic(
        env_params, env_params['obs'] + 2 * env_params['goal'] + env_params['action'])
    # load the weights into the target networks
    self.actor_target_network.load_state_dict(self.actor_network.state_dict())
    self.critic_target_network.load_state_dict(self.critic_network.state_dict())
    # move the networks to the GPU if requested
    if self.args.cuda:
        self.actor_network.cuda()
        self.critic_network.cuda()
        self.actor_target_network.cuda()
        self.critic_target_network.cuda()
    # create the optimizers
    self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
    self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
    # one HER sampler per environment
    self.her_module_lst = [
        her_sampler(self.args.replay_strategy, self.args.replay_k, env.compute_reward)
        for env in self.envs_lst
    ]
    # one replay buffer per environment
    self.buffer_lst = [
        replay_buffer(self.env_params, self.args.buffer_size,
                      her_module.sample_her_transitions, ee_reward=True)
        for her_module in self.her_module_lst
    ]
    # path to save the model
    self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
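# A hypothetical usage sketch for the single-environment variants above,
# assuming the classic Gym robotics API (reset() returns a goal-conditioned
# observation dict) and a TimeLimit-wrapped environment. The get_args helper,
# the ddpg_agent class name and the learn() call are assumptions for
# illustration, not part of this code.
import gym

def get_env_params(env):
    obs = env.reset()
    return {
        'obs': obs['observation'].shape[0],
        'goal': obs['desired_goal'].shape[0],
        'action': env.action_space.shape[0],
        'action_max': env.action_space.high[0],
        'max_timesteps': env._max_episode_steps,
    }

# env = gym.make('FetchReach-v1')
# agent = ddpg_agent(get_args(), env, get_env_params(env))
# agent.learn()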