def __init__(self, args, env, env_params):
    self.args = args
    self.env = env
    self.env_params = env_params
    self.cauchy = Cauchy(torch.tensor([0.0]), torch.tensor([0.5]))
    # create the networks
    self.forward_network = ForwardMap(env_params, args.embed_dim)
    self.backward_network = BackwardMap(env_params, args.embed_dim)
    # build up the target networks
    self.forward_target_network = ForwardMap(env_params, args.embed_dim)
    self.backward_target_network = BackwardMap(env_params, args.embed_dim)
    # load the weights into the target networks
    self.forward_target_network.load_state_dict(self.forward_network.state_dict())
    self.backward_target_network.load_state_dict(self.backward_network.state_dict())
    # move to the GPU if requested
    if self.args.cuda:
        self.forward_network.cuda()
        self.backward_network.cuda()
        self.forward_target_network.cuda()
        self.backward_target_network.cuda()
    # create a single optimizer over both maps
    f_params = list(self.forward_network.parameters())
    b_params = list(self.backward_network.parameters())
    self.fb_optim = torch.optim.Adam(f_params + b_params, lr=self.args.lr)
    # self.backward_optim = torch.optim.Adam(self.backward_network.parameters(), lr=self.args.lr_backward)
    # her sampler
    self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env.compute_reward)
    # create the replay buffer
    self.buffer = replay_buffer(self.env_params, self.args.buffer_size, self.her_module.sample_her_transitions)
    # create the normalizers
    self.o_norm = normalizer(size=env_params['obs'], default_clip_range=self.args.clip_range)
    self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)
    # create the directory to store the model, and log the run options
    if args.save_dir is not None:
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        print(' ' * 26 + 'Options')
        for k, v in vars(self.args).items():
            print(' ' * 26 + k + ': ' + str(v))
        with open(self.args.save_dir + "/arguments.pkl", 'wb') as f:
            pickle.dump(self.args, f)
        with open('{}/score_monitor.csv'.format(self.args.save_dir), "wt") as monitor_file:
            monitor = csv.writer(monitor_file)
            monitor.writerow(['epoch', 'eval', 'avg dist', 'eval (GPI)', 'avg dist (GPI)'])
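
# A minimal sketch of the soft (polyak-averaged) update these target copies are
# usually paired with during training; the averaging coefficient is assumed to
# live in args.polyak, which is not taken from the constructor above.
def _soft_update_target_network(self, target, source):
    for target_param, param in zip(target.parameters(), source.parameters()):
        # target <- polyak * target + (1 - polyak) * source
        target_param.data.copy_((1 - self.args.polyak) * param.data + self.args.polyak * target_param.data)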

def __init__(self, args, env, env_params):
    self.args = args
    self.env = env
    self.env_params = env_params
    # create the networks
    self.actor_network = actor(env_params)
    self.critic_network = critic(env_params)
    # sync the networks across the cpus
    sync_networks(self.actor_network)
    sync_networks(self.critic_network)
    # build up the target networks
    self.actor_target_network = actor(env_params)
    self.critic_target_network = critic(env_params)
    # load the weights into the target networks
    self.actor_target_network.load_state_dict(self.actor_network.state_dict())
    self.critic_target_network.load_state_dict(self.critic_network.state_dict())
    # move to the GPU if requested
    if self.args.cuda:
        self.actor_network.cuda()
        self.critic_network.cuda()
        self.actor_target_network.cuda()
        self.critic_target_network.cuda()
    # create the optimizers
    if self.args.optimizer_type == 'SGD':
        self.actor_optim = torch.optim.SGD(self.actor_network.parameters(), lr=self.args.lr_actor)
        self.critic_optim = torch.optim.SGD(self.critic_network.parameters(), lr=self.args.lr_critic)
    elif self.args.optimizer_type == 'adam':
        self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
        self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
    else:
        raise ValueError('unknown optimizer_type: {}'.format(self.args.optimizer_type))
    # her sampler
    self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env.compute_reward)
    # create the replay buffer
    self.buffer = replay_buffer(self.env_params, self.args.buffer_size, self.her_module.sample_her_transitions)
    # create the normalizers
    self.o_norm = normalizer(size=env_params['obs'], default_clip_range=self.args.clip_range)
    self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)
    self.scales = []
    # create the directory to store the model (rank 0 only)
    if MPI.COMM_WORLD.Get_rank() == 0:
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        # path to save the model
        self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
        self.result_dir = f'./learning_curves/{args.env_name}/{self.args.run_name}'
        if not os.path.isdir(self.result_dir):
            os.makedirs(self.result_dir, exist_ok=True)
            print(f'creating {self.result_dir}')
        self.writer = SummaryWriter(logdir=self.result_dir)
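
# A hedged sketch of how the o_norm / g_norm normalizers above are typically
# applied before a forward pass; it assumes numpy is imported as np and that
# the networks consume the concatenation of normalized observation and goal.
def _preproc_inputs(self, obs, g):
    obs_norm = self.o_norm.normalize(obs)
    g_norm = self.g_norm.normalize(g)
    # concatenate the normalized observation and goal into a single input row
    inputs = np.concatenate([obs_norm, g_norm])
    inputs = torch.tensor(inputs, dtype=torch.float32).unsqueeze(0)
    if self.args.cuda:
        inputs = inputs.cuda()
    return inputs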

def __init__(self, args, env, env_params):
    self.args = args
    # path to save the model
    self.exp_name = '_'.join((self.args.env_name, self.args.alg, str(self.args.seed), datetime.now().isoformat()))
    self.data_path = os.path.join(self.args.save_dir, '_'.join((self.args.env_name, self.args.alg)), self.exp_name)
    self.logger = EpochLogger(output_dir=self.data_path, exp_name=self.exp_name)
    self.logger.save_config(args)
    self.env = env
    self.env_params = env_params
    # create the network
    self.actor_network = actor(env_params)
    self.critic_network = critic(env_params)
    # sync the networks across the cpus
    sync_networks(self.actor_network)
    sync_networks(self.critic_network)
    # build up the target network
    self.actor_target_network = actor(env_params)
    self.critic_target_network = critic(env_params)
    # load the weights into the target networks
    self.actor_target_network.load_state_dict(self.actor_network.state_dict())
    self.critic_target_network.load_state_dict(self.critic_network.state_dict())
    # if use gpu: spread the MPI workers across the available GPUs
    self.rank = MPI.COMM_WORLD.Get_rank()
    if args.cuda:
        device = 'cuda:{}'.format(self.rank % torch.cuda.device_count())
    else:
        device = 'cpu'
    self.device = torch.device(device)
    if self.args.cuda:
        self.actor_network.cuda(self.device)
        self.critic_network.cuda(self.device)
        self.actor_target_network.cuda(self.device)
        self.critic_target_network.cuda(self.device)
    # create the optimizer
    self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
    self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
    # her sampler
    self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env.compute_reward)
    # create the replay buffer
    self.buffer = replay_buffer(self.env_params, self.args.buffer_size, self.her_module.sample_her_transitions)
    # create the normalizer
    self.o_norm = normalizer(size=env_params['obs'], default_clip_range=self.args.clip_range)
    self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)
    self.logger.setup_pytorch_saver(self.actor_network)
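
# For reference, a minimal sketch of what a sync_networks helper typically does
# under MPI: flatten the parameters, broadcast rank 0's copy, and write it back.
# This is an assumption about the helper used above, not its actual source.
def sync_networks_sketch(network):
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    # gather every parameter into one flat float32 buffer
    flat_params = np.concatenate([p.data.cpu().numpy().flatten() for p in network.parameters()])
    comm.Bcast(flat_params, root=0)
    # copy the broadcast values back into the network's tensors
    pointer = 0
    for p in network.parameters():
        numel = p.data.numel()
        p.data.copy_(torch.tensor(flat_params[pointer:pointer + numel]).view_as(p.data))
        pointer += numel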

def __init__(self, args, env, env_params):
    self.args = args
    self.env = env
    self.env_params = env_params
    # create the networks
    self.actor_network = actor(env_params)
    self.critic_network = critic(env_params)
    # sync the networks across the cpus
    sync_networks(self.actor_network)
    sync_networks(self.critic_network)
    # build up the target networks
    self.actor_target_network = actor(env_params)
    self.critic_target_network = critic(env_params)
    # load the weights into the target networks
    self.actor_target_network.load_state_dict(self.actor_network.state_dict())
    self.critic_target_network.load_state_dict(self.critic_network.state_dict())
    # move to the GPU if requested
    if self.args.cuda:
        self.actor_network.cuda()
        self.critic_network.cuda()
        self.actor_target_network.cuda()
        self.critic_target_network.cuda()
    # create the optimizers
    self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
    self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
    # her sampler
    self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env.compute_reward)
    # create the replay buffer
    self.buffer = replay_buffer(self.env_params, self.args.buffer_size, self.her_module.sample_her_transitions)
    # create the normalizers
    self.o_norm = normalizer(size=env_params['obs'], default_clip_range=self.args.clip_range)
    self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)
    # create the directory to store the model (rank 0 only)
    if MPI.COMM_WORLD.Get_rank() == 0:
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        # path to save the model
        self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
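
# A rough sketch of the 'future' relabeling strategy behind her_sampler, under
# the usual storage assumptions: episodes are arrays keyed by 'actions', 'g',
# and 'ag' / 'ag_next' (achieved goals, with T + 1 entries per episode), and
# replay_k controls the fraction of relabeled transitions.
def sample_her_transitions_sketch(episode_batch, batch_size, replay_k, compute_reward):
    T = episode_batch['actions'].shape[1]
    rollout_batch_size = episode_batch['actions'].shape[0]
    future_p = 1 - (1.0 / (1 + replay_k))  # fraction of transitions to relabel
    # pick random (episode, timestep) pairs
    episode_idxs = np.random.randint(0, rollout_batch_size, batch_size)
    t_samples = np.random.randint(T, size=batch_size)
    transitions = {key: episode_batch[key][episode_idxs, t_samples].copy() for key in episode_batch.keys()}
    # replace the goal with an achieved goal from a later step of the same episode
    her_indexes = np.where(np.random.uniform(size=batch_size) < future_p)
    future_offset = (np.random.uniform(size=batch_size) * (T - t_samples)).astype(int)
    future_t = (t_samples + 1 + future_offset)[her_indexes]
    transitions['g'][her_indexes] = episode_batch['ag'][episode_idxs[her_indexes], future_t]
    # recompute the rewards for the relabeled goals
    transitions['r'] = np.expand_dims(compute_reward(transitions['ag_next'], transitions['g'], None), 1)
    return transitions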

def __init__(self, args, env, env_params):
    self.args = args
    self.env = env
    self.env_params = env_params
    # create the networks
    self.actor_network = actor(args, env_params)
    self.critic_network = critic(args, env_params)
    # sync the networks across the cpus
    sync_networks(self.actor_network)
    sync_networks(self.critic_network)
    # build up the target networks
    self.actor_target_network = actor(args, env_params)
    self.critic_target_network = critic(args, env_params)
    # load the weights into the target networks
    self.actor_target_network.load_state_dict(self.actor_network.state_dict())
    self.critic_target_network.load_state_dict(self.critic_network.state_dict())
    # move to the GPU if requested
    if self.args.cuda:
        self.actor_network.cuda()
        self.critic_network.cuda()
        self.actor_target_network.cuda()
        self.critic_target_network.cuda()
    # create the optimizers
    self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
    self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
    # her sampler
    self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env.compute_reward)
    # create the replay buffer
    self.buffer = replay_buffer(self.env_params, self.args.buffer_size, self.her_module.sample_her_transitions)
    # create the normalizers
    self.o_norm = normalizer(size=env_params['obs'], default_clip_range=self.args.clip_range)
    self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)
    # create the directory to store the model (rank 0 only)
    if MPI.COMM_WORLD.Get_rank() == 0:
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        # path to save the model
        self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
    # added: set up text and tabular logging
    log_dir = create_env_folder(args.env_name, args.network_class, test=args.test)
    save_kwargs(vars(args), log_dir)
    tabular_log_path = osp.join(log_dir, 'progress.csv')
    text_log_path = osp.join(log_dir, 'debug.log')
    logger.add_text_output(text_log_path)
    logger.add_tabular_output(tabular_log_path)
    exp_name = f'{args.env_name}'
    logger.push_prefix("[%s] " % exp_name)
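
# A hedged usage sketch for the tabular logger wired up above; record_tabular
# and dump_tabular are assumed from that logger's conventional rlkit-style API.
def _log_epoch_sketch(epoch, success_rate):
    logger.record_tabular('epoch', epoch)
    logger.record_tabular('success_rate', success_rate)
    logger.dump_tabular()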

def __init__(self, args, env, env_params):
    self.args = args
    self.env = env
    self.env_params = env_params
    # create the network
    self.critic_network = critic(env_params)
    # build up the target network
    self.critic_target_network = critic(env_params)
    # load the weights into the target network
    self.critic_target_network.load_state_dict(self.critic_network.state_dict())
    # move to the GPU if requested
    if self.args.cuda:
        self.critic_network.cuda()
        self.critic_target_network.cuda()
    # create the optimizer
    self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
    # her sampler
    self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env.compute_reward)
    # create the replay buffer
    self.buffer = replay_buffer(self.env_params, self.args.buffer_size, self.her_module.sample_her_transitions)
    # create the normalizers
    self.o_norm = normalizer(size=env_params['obs'], default_clip_range=self.args.clip_range)
    self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)
    # create the directory to store the model, and log the run options
    if args.save_dir is not None:
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        print(' ' * 26 + 'Options')
        for k, v in vars(self.args).items():
            print(' ' * 26 + k + ': ' + str(v))
        with open(self.args.save_dir + "/arguments.pkl", 'wb') as f:
            pickle.dump(self.args, f)
        with open('{}/score_monitor.csv'.format(self.args.save_dir), "wt") as monitor_file:
            monitor = csv.writer(monitor_file)
            monitor.writerow(['epoch', 'eval', 'dist'])
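
# The score_monitor.csv above is truncated ("wt") at startup, leaving only the
# header row; a hedged sketch of how later evaluations would append rows to it
# (the append-mode reopen is an assumption, not shown in this constructor).
def _log_eval_sketch(save_dir, epoch, success_rate, dist):
    with open('{}/score_monitor.csv'.format(save_dir), 'a') as monitor_file:
        monitor = csv.writer(monitor_file)
        monitor.writerow([epoch, success_rate, dist])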

def __init__(self, args, env, env_params):
    self.args = args
    self.env = env
    self.env_params = env_params
    # set up an offscreen renderer for image observations
    sim = self.env.sim
    self.viewer = MjRenderContextOffscreen(sim)
    # self.viewer.cam.fixedcamid = 3
    # self.viewer.cam.type = const.CAMERA_FIXED
    self.critic_loss = []
    self.actor_loss = []
    self.viewer.cam.distance = 1.2
    self.viewer.cam.azimuth = 180
    self.viewer.cam.elevation = -25
    env.env._viewers['rgb_array'] = self.viewer
    self.env_params = env_params
    self.image_based = bool(args.image)
    print("Training image-based RL? {}".format(self.image_based))
    # create the networks
    if not self.image_based:
        self.actor_network = actor(env_params)
    else:
        self.actor_network = new_actor(env_params)
        # self.actor_network = resnet_actor(env_params)
    self.critic_network = critic(env_params)
    # sync the networks across the cpus
    sync_networks(self.actor_network)
    sync_networks(self.critic_network)
    # build up the target networks
    if not self.image_based:
        self.actor_target_network = actor(env_params)
    else:
        # self.actor_target_network = resnet_actor(env_params)
        self.actor_target_network = new_actor(env_params)
    self.critic_target_network = critic(env_params)
    # load the weights into the target networks
    self.actor_target_network.load_state_dict(self.actor_network.state_dict())
    self.critic_target_network.load_state_dict(self.critic_network.state_dict())
    # move to the GPU if requested (one device per MPI rank)
    if self.args.cuda:
        print("use the GPU")
        self.actor_network.cuda(MPI.COMM_WORLD.Get_rank())
        self.critic_network.cuda(MPI.COMM_WORLD.Get_rank())
        self.actor_target_network.cuda(MPI.COMM_WORLD.Get_rank())
        self.critic_target_network.cuda(MPI.COMM_WORLD.Get_rank())
    # create the optimizers
    self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
    self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
    # her sampler
    self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env.compute_reward, self.image_based)
    # create the replay buffer
    self.buffer = replay_buffer(self.env_params, self.args.buffer_size, self.her_module.sample_her_transitions, self.image_based)
    # create the normalizers
    self.o_norm = normalizer(size=env_params['obs'], default_clip_range=self.args.clip_range)
    self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)
    # create the directory to store the model (rank 0 only)
    if MPI.COMM_WORLD.Get_rank() == 0:
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        # path to save the model
        self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
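
# A hedged usage sketch for the offscreen viewer configured above, assuming the
# mujoco_py MjRenderContextOffscreen API (render + read_pixels); the 100x100
# image size is an arbitrary illustrative choice.
def _render_image_sketch(self, width=100, height=100):
    self.viewer.render(width, height)
    # read_pixels returns the frame upside down, hence the vertical flip
    rgb = self.viewer.read_pixels(width, height, depth=False)[::-1, :, :]
    return rgb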

def __init__(self, args, env1, env2, env1_params, env2_params):
    self.args = args
    self.env1 = env1
    self.env2 = env2
    self.env1_params = env1_params
    self.env2_params = env2_params
    # reserve one extra observation slot for the injected environment indicator
    if not self.args.dont_inject_observation:
        self.env2_params['obs'] += 1
        self.env1_params['obs'] += 1
    self.train_mode = TrainMode(args.training_mode)
    # store the Weights & Biases API key if given in args
    if self.args.wandb_api_key is not None:
        os.environ["WANDB_API_KEY"] = self.args.wandb_api_key
    # if a key is present, set a flag to enable wandb logging
    self.use_wandb_log = os.environ.get("WANDB_API_KEY") is not None
    # create the networks; both envs must share the same dimensions
    assert env1_params == env2_params
    self.actor_network = actor(env1_params)
    self.critic_network = critic(env1_params)
    # sync the networks across the cpus
    sync_networks(self.actor_network)
    sync_networks(self.critic_network)
    # build up the target networks
    self.actor_target_network = actor(env1_params)
    self.critic_target_network = critic(env1_params)
    # load the weights into the target networks
    self.actor_target_network.load_state_dict(self.actor_network.state_dict())
    self.critic_target_network.load_state_dict(self.critic_network.state_dict())
    # move to the GPU if requested
    if self.args.cuda:
        self.actor_network.cuda()
        self.critic_network.cuda()
        self.actor_target_network.cuda()
        self.critic_target_network.cuda()
    # create the optimizers
    self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
    self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
    # set up a second critic if requested
    self.use_two_critics = self.args.dual_critic
    if self.use_two_critics:
        self.critic_network2 = critic(env1_params)
        sync_networks(self.critic_network2)
        self.critic_target_network2 = critic(env1_params)
        self.critic_target_network2.load_state_dict(self.critic_network2.state_dict())
        self.critic2_optim = torch.optim.Adam(self.critic_network2.parameters(), lr=self.args.lr_critic)
        if self.args.cuda:
            self.critic_network2.cuda()
            self.critic_target_network2.cuda()
    # her samplers, one per environment
    self.her_module1 = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env1.compute_reward)
    self.her_module2 = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env2.compute_reward)
    # create the replay buffers
    self.buffer1 = replay_buffer(self.env1_params, self.args.buffer_size, self.her_module1.sample_her_transitions)
    self.buffer2 = replay_buffer(self.env2_params, self.args.buffer_size, self.her_module2.sample_her_transitions)
    # create the normalizers
    self.o_norm = normalizer(size=env1_params['obs'], default_clip_range=self.args.clip_range)
    self.g_norm = normalizer(size=env1_params['goal'], default_clip_range=self.args.clip_range)
    # create the directory for storing the model (rank 0 only)
    if MPI.COMM_WORLD.Get_rank() == 0:
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        # path to save the model
        self.model_path = os.path.join(self.args.save_dir, self.args.env1_name + self.args.env2_name)
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
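
# The obs-dimension bump above (+1) suggests a per-environment indicator is
# appended to each observation so one policy can serve both environments; a
# minimal sketch of that injection, where env_id in {0, 1} is an assumed convention.
def _inject_env_id_sketch(obs, env_id):
    # append a scalar identifying which environment produced the observation
    return np.concatenate([obs, np.array([float(env_id)])])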

def __init__(self, args, env, env_params):
    self.args = args
    self.env = env
    self.env_params = env_params
    # check whether to continue training or start fresh
    if args.continue_training is None:
        self.continueTraining = False
    else:
        self.continueTraining = args.continue_training
    # create the networks
    self.actor_network = actor(env_params)
    self.critic_network = critic(env_params)
    # sync the networks across the cpus
    sync_networks(self.actor_network)
    sync_networks(self.critic_network)
    # build up the target networks
    self.actor_target_network = actor(env_params)
    self.critic_target_network = critic(env_params)
    # print the run configuration (rank 0 only)
    dash = "-" * 42
    if MPI.COMM_WORLD.Get_rank() == 0:
        print("env.spec.id: ", env.spec.id)
        print("args: ")
        d_args = vars(args)
        print(dash)
        print("{:<25s}{:<15s}".format("ARGS", "VALUE"))
        for key in d_args:
            if d_args[key] is not None:
                print("|{:<22s} | {:<15}|".format(key, d_args[key]))
        print(dash)
        print("env_inits: ")
        print("{:<25s}{:<15s}".format("ENV_INIT", "VALUE"))
        for key in env.env.inits:
            print("|{:<22s} | {:<15}|".format(key, env.env.inits[key]))
        print(dash)
        print("env_dimensions: ")
        for key in env_params:
            print("|{:<22s} | {:<15}|".format(key, env_params[key]))
        print(dash)
    if self.continueTraining:
        if MPI.COMM_WORLD.Get_rank() == 0:
            print("CONTINUE TRAINING...")
        env_name = env.spec.id
        saved_dicts = load_saved_state_dicts(args.save_dir, env_name, MPI.COMM_WORLD.Get_rank())
        self.actor_network.load_state_dict(saved_dicts['actor'])
        self.critic_network.load_state_dict(saved_dicts['critic'])
        self.critic_target_network.load_state_dict(saved_dicts['critic_target'])
        self.actor_target_network.load_state_dict(saved_dicts['actor_target'])
    else:
        # load the weights into the target networks
        self.actor_target_network.load_state_dict(self.actor_network.state_dict())
        self.critic_target_network.load_state_dict(self.critic_network.state_dict())
    # move to the GPU if requested
    if self.args.cuda:
        self.actor_network.cuda()
        self.critic_network.cuda()
        self.actor_target_network.cuda()
        self.critic_target_network.cuda()
    # create the optimizers
    self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
    self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
    # her sampler
    self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env.compute_reward)
    # create the replay buffer
    self.buffer = replay_buffer(self.env_params, self.args.buffer_size, self.her_module.sample_her_transitions)
    # create the normalizers
    self.o_norm = normalizer(size=env_params['obs'], default_clip_range=self.args.clip_range)
    self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)
    # create the directory to store the model (rank 0 only)
    if MPI.COMM_WORLD.Get_rank() == 0:
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        # path to save the model
        self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
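
# A sketch of the save-side counterpart to load_saved_state_dicts used above;
# the layout (one file holding four state dicts) is an assumption chosen to
# match the keys the constructor reads back.
def _save_state_dicts_sketch(self, path):
    torch.save({
        'actor': self.actor_network.state_dict(),
        'critic': self.critic_network.state_dict(),
        'actor_target': self.actor_target_network.state_dict(),
        'critic_target': self.critic_target_network.state_dict(),
    }, path)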

def __init__(self, args, compute_rew, goal_sampler):
    self.args = args
    self.alpha = args.alpha
    self.env_params = args.env_params
    self.goal_sampler = goal_sampler
    self.total_iter = 0
    self.freq_target_update = args.freq_target_update
    # create the network
    self.architecture = self.args.architecture
    if self.architecture == 'flat':
        self.actor_network = GaussianPolicyFlat(self.env_params)
        self.critic_network = QNetworkFlat(self.env_params)
        # sync the networks across the CPUs
        sync_networks(self.actor_network)
        sync_networks(self.critic_network)
        # build up the target network
        self.critic_target_network = QNetworkFlat(self.env_params)
        hard_update(self.critic_target_network, self.critic_network)
        sync_networks(self.critic_target_network)
        # create the optimizer
        self.policy_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
        self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
    elif self.architecture == 'deepsets':
        if args.algo == 'language':
            from rl_modules.language_models import DeepSetLanguage
            self.model = DeepSetLanguage(self.env_params, args)
        elif args.algo == 'continuous':
            from rl_modules.continuous_models import DeepSetContinuous
            self.model = DeepSetContinuous(self.env_params, args)
        else:
            from rl_modules.semantic_models import DeepSetSemantic
            self.model = DeepSetSemantic(self.env_params, args)
        # sync the networks across the CPUs
        sync_networks(self.model.critic)
        sync_networks(self.model.actor)
        hard_update(self.model.critic_target, self.model.critic)
        sync_networks(self.model.critic_target)
        # create the optimizer
        self.policy_optim = torch.optim.Adam(list(self.model.actor.parameters()), lr=self.args.lr_actor)
        self.critic_optim = torch.optim.Adam(list(self.model.critic.parameters()), lr=self.args.lr_critic)
    else:
        raise NotImplementedError
    # create the normalizer
    self.o_norm = normalizer(size=self.env_params['obs'], default_clip_range=self.args.clip_range)
    self.g_norm = normalizer(size=self.env_params['goal'], default_clip_range=self.args.clip_range)
    # if use GPU
    if self.args.cuda:
        self.actor_network.cuda()
        self.critic_network.cuda()
        self.critic_target_network.cuda()
    # Target Entropy
    if self.args.automatic_entropy_tuning:
        self.target_entropy = -torch.prod(torch.Tensor(self.env_params['action'])).item()
        self.log_alpha = torch.zeros(1, requires_grad=True)
        self.alpha_optim = torch.optim.Adam([self.log_alpha], lr=self.args.lr_entropy)
    # her sampler
    self.continuous_goals = args.algo == 'continuous'
    self.language = args.algo == 'language'
    self.her_module = her_sampler(self.args, compute_rew)
    # create the replay buffer
    self.buffer = MultiBuffer(env_params=self.env_params,
                              buffer_size=self.args.buffer_size,
                              sample_func=self.her_module.sample_her_transitions,
                              multi_head=self.args.multihead_buffer if not self.language else False,
                              goal_sampler=self.goal_sampler)
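
# With automatic entropy tuning enabled above, the temperature is typically
# updated with the standard SAC objective; this sketch assumes log_prob holds
# the log-probabilities of actions sampled from the current policy.
def _update_alpha_sketch(self, log_prob):
    # the alpha loss pushes the policy's entropy toward the target entropy
    alpha_loss = -(self.log_alpha * (log_prob + self.target_entropy).detach()).mean()
    self.alpha_optim.zero_grad()
    alpha_loss.backward()
    self.alpha_optim.step()
    self.alpha = self.log_alpha.exp().item()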

def __init__(self, args, env, env_params):
    self.args = args
    # path to save the model
    self.exp_name = '_'.join((self.args.env_name, self.args.alg, str(self.args.seed), datetime.now().isoformat()))
    self.data_path = os.path.join(self.args.save_dir, '_'.join((self.args.env_name, self.args.alg)), self.exp_name)
    self.logger = EpochLogger(output_dir=self.data_path, exp_name=self.exp_name)
    self.logger.save_config(args)
    self.env = env
    self.env_params = env_params
    # create the networks (SAC-style: one actor, two critics)
    self.actor_network = actor(env_params)
    self.critic_network1 = critic(env_params)
    self.critic_network2 = critic(env_params)
    # sync the networks across the cpus
    sync_networks(self.actor_network)
    sync_networks(self.critic_network1)
    sync_networks(self.critic_network2)
    # build up the target networks (no actor target is needed)
    # self.actor_target_network = actor(env_params)
    self.critic_target_network1 = critic(env_params)
    self.critic_target_network2 = critic(env_params)
    # load the weights into the target networks
    # self.actor_target_network.load_state_dict(self.actor_network.state_dict())
    self.critic_target_network1.load_state_dict(self.critic_network1.state_dict())
    self.critic_target_network2.load_state_dict(self.critic_network2.state_dict())
    # pick a device: spread the MPI workers across the available GPUs
    self.rank = MPI.COMM_WORLD.Get_rank()
    if args.cuda:
        device = 'cuda:{}'.format(self.rank % torch.cuda.device_count())
    else:
        device = 'cpu'
    self.device = torch.device(device)
    if self.args.cuda:
        self.actor_network.cuda(self.device)
        self.critic_network1.cuda(self.device)
        self.critic_network2.cuda(self.device)
        # self.actor_target_network.cuda(self.device)
        self.critic_target_network1.cuda(self.device)
        self.critic_target_network2.cuda(self.device)
    # create the optimizers
    self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
    self.critic_optim1 = torch.optim.Adam(self.critic_network1.parameters(), lr=self.args.lr_critic)
    self.critic_optim2 = torch.optim.Adam(self.critic_network2.parameters(), lr=self.args.lr_critic)
    # her sampler
    self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env.compute_reward)
    # create the replay buffer
    self.buffer = replay_buffer(self.env_params, self.args.buffer_size, self.her_module.sample_her_transitions)
    # create the normalizers
    self.o_norm = normalizer(size=env_params['obs'], default_clip_range=self.args.clip_range)
    self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)
    self.logger.setup_pytorch_saver(self.actor_network)
    # auto temperature: if args.alpha < 0, use automatic entropy tuning with initial alpha = -args.alpha
    if self.args.alpha < 0.0:
        self.alpha = -self.args.alpha
        self.log_alpha = torch.tensor(np.log(self.alpha), dtype=torch.float32, device=self.device, requires_grad=True)
        self.target_entropy = -np.prod(env.action_space.shape).astype(np.float32)
        self.target_entropy = self.target_entropy / 2.0
        self.alpha_optim = torch.optim.Adam([self.log_alpha], lr=self.args.lr_actor)
    else:
        self.alpha = torch.tensor(self.args.alpha)
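
# A sketch of the clipped double-Q target the two critics above are combined
# into, SAC-style; the target-network names follow the constructor, while
# args.gamma and the (inputs, actions) call signature are assumptions.
def _compute_target_q_sketch(self, inputs_next, actions_next, log_prob_next, rewards):
    with torch.no_grad():
        q1_next = self.critic_target_network1(inputs_next, actions_next)
        q2_next = self.critic_target_network2(inputs_next, actions_next)
        # take the pessimistic estimate and subtract the entropy bonus
        min_q_next = torch.min(q1_next, q2_next) - self.alpha * log_prob_next
        target_q = rewards + self.args.gamma * min_q_next
    return target_q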

def __init__(self, args, env, env_params, writer=None):
    if args.cuda:
        torch.cuda.set_device(args.device)
    self.args = args
    self.env = env
    # each of the two agents controls half of the composite action space
    env_params['action'] = env_params['action'] // 2
    env_params['obs'] = 37  # original 56
    self.env_params = env_params
    # create the first agent's networks
    self.actor_network_1 = actor_prob(env_params)
    self.critic_network_1 = critic(env_params)
    # sync the networks across the cpus
    sync_networks(self.actor_network_1)
    sync_networks(self.critic_network_1)
    # build up the target networks
    self.actor_target_network_1 = actor_prob(env_params)
    self.critic_target_network_1 = critic(env_params)
    # load the weights into the target networks
    self.actor_target_network_1.load_state_dict(self.actor_network_1.state_dict())
    self.critic_target_network_1.load_state_dict(self.critic_network_1.state_dict())
    # create the second agent's networks
    self.actor_network_2 = actor_prob(env_params)
    self.critic_network_2 = critic(env_params)
    sync_networks(self.actor_network_2)
    sync_networks(self.critic_network_2)
    self.actor_target_network_2 = actor_prob(env_params)
    self.critic_target_network_2 = critic(env_params)
    self.actor_target_network_2.load_state_dict(self.actor_network_2.state_dict())
    self.critic_target_network_2.load_state_dict(self.critic_network_2.state_dict())
    # move to the GPU if requested
    if self.args.cuda:
        self.actor_network_1.to(args.device)
        self.critic_network_1.to(args.device)
        self.actor_target_network_1.to(args.device)
        self.critic_target_network_1.to(args.device)
        self.actor_network_2.to(args.device)
        self.critic_network_2.to(args.device)
        self.actor_target_network_2.to(args.device)
        self.critic_target_network_2.to(args.device)
    # create the optimizers
    self.actor_optim_1 = torch.optim.Adam(self.actor_network_1.parameters(), lr=self.args.lr_actor)
    self.critic_optim_1 = torch.optim.Adam(self.critic_network_1.parameters(), lr=self.args.lr_critic)
    self.actor_optim_2 = torch.optim.Adam(self.actor_network_2.parameters(), lr=self.args.lr_actor_2)
    self.critic_optim_2 = torch.optim.Adam(self.critic_network_2.parameters(), lr=self.args.lr_critic_2)
    # her samplers
    if 'Stack' in self.args.env_name or 'Lift' in self.args.env_name:
        self.her_module_1 = her_sampler(self.args.replay_strategy, self.args.replay_k,
                                        self.env.compute_reward, stack=True)
        self.her_module_2 = her_sampler(self.args.replay_strategy, self.args.replay_k,
                                        self.env.compute_reward, stack=True)
    else:
        self.her_module_1 = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env.compute_reward)
        self.her_module_2 = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env.compute_reward)
    # if args.only_for_door:
    #     self.her_module_2 = her_sampler(self.args.replay_strategy, self.args.replay_k,
    #                                     self.env.compute_reward_only_for_door, double=True)
    # else:
    #     self.her_module_2 = her_sampler(self.args.replay_strategy, self.args.replay_k,
    #                                     self.env.compute_reward_for_door, double=True)
    # create the replay buffers
    self.buffer_1 = replay_buffer(self.env_params, self.args.buffer_size, self.her_module_1.sample_her_transitions)
    self.buffer_2 = replay_buffer(self.env_params, self.args.buffer_size, self.her_module_2.sample_her_transitions)
    # self.buffer_2 = replay_buffer(self.env_params, self.args.buffer_size,
    #                               self.her_module_2.sample_her_transitions, double=True)
    # create the normalizers
    self.o_norm_1 = normalizer(size=37, default_clip_range=self.args.clip_range)  # original 56
    self.o_norm_2 = normalizer(size=37, default_clip_range=self.args.clip_range)  # original 56
    self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)  # for visualization
    self.g_norm_1 = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)
    self.g_norm_2 = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)
    # optional count-based exploration bonus
    self.explore = None
    if args.count_exp:
        self.explore = HashingBonusEvaluator(obs_processed_flat_dim=37, beta=args.exp_beta)  # original 56
    # create the directories to store the model and training statistics (rank 0 only)
    if MPI.COMM_WORLD.Get_rank() == 0:
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        # path to save the model
        self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
        if args.count_exp:
            self.model_path = os.path.join(self.model_path, 'count-exploration')
            if not os.path.exists(self.model_path):
                os.mkdir(self.model_path)
        # flag = 'actor_lr_' + str(self.args.lr_actor) + '_critic_lr_' + str(self.args.lr_critic) \
        #        + '_actor_lr_2_' + str(self.args.lr_actor_2) + '_critic_lr_2_' + str(self.args.lr_critic_2)
        # flag += '_shaped_reward'
        # if args.only_for_door:
        #     flag += '_only_for_door'
        # else:
        #     flag += '_not_only_for_door'
        flag = str(self.args.env_name)
        self.model_path = os.path.join(self.model_path, flag)
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
        if not os.path.exists(self.args.save_training_success_rate_dir):
            os.mkdir(self.args.save_training_success_rate_dir)
        self.training_success_rate_path = os.path.join(self.args.save_training_success_rate_dir, self.args.env_name)
        if not os.path.exists(self.training_success_rate_path):
            os.mkdir(self.training_success_rate_path)
        self.training_success_rate_path = os.path.join(self.training_success_rate_path, flag)
        if not os.path.exists(self.training_success_rate_path):
            os.mkdir(self.training_success_rate_path)
        if not os.path.exists(self.args.save_training_return_dir):
            os.mkdir(self.args.save_training_return_dir)
        self.training_return_path = os.path.join(self.args.save_training_return_dir, self.args.env_name)
        if not os.path.exists(self.training_return_path):
            os.mkdir(self.training_return_path)
        self.training_return_path = os.path.join(self.training_return_path, flag)
        if not os.path.exists(self.training_return_path):
            os.mkdir(self.training_return_path)
    self.writer = writer
    # for sac, one more critic for each agent
    self.critic_network_1_2 = critic(env_params)
    self.critic_network_2_2 = critic(env_params)
    sync_networks(self.critic_network_1_2)
    sync_networks(self.critic_network_2_2)
    self.critic_target_network_1_2 = critic(env_params)
    self.critic_target_network_2_2 = critic(env_params)
    # load the weights into the target networks
    self.critic_target_network_1_2.load_state_dict(self.critic_network_1_2.state_dict())
    self.critic_target_network_2_2.load_state_dict(self.critic_network_2_2.state_dict())
    # move to the GPU if requested
    if self.args.cuda:
        self.critic_network_1_2.to(args.device)
        self.critic_target_network_1_2.to(args.device)
        self.critic_network_2_2.to(args.device)
        self.critic_target_network_2_2.to(args.device)
    # create the optimizers
    self.critic_optim_1_2 = torch.optim.Adam(self.critic_network_1_2.parameters(), lr=self.args.lr_critic)
    self.critic_optim_2_2 = torch.optim.Adam(self.critic_network_2_2.parameters(), lr=self.args.lr_critic_2)
    # entropy temperatures, one per agent (both share the same alpha_lr)
    self.alpha_1 = args.alpha  # 0.2
    self.alpha_2 = args.alpha  # 0.2
    single_action_space = (env.action_space.shape[0] // 2,)
    if self.args.cuda:
        self.target_entropy_1 = -torch.prod(torch.FloatTensor(single_action_space).to(self.args.device)).item()
        self.log_alpha_1 = torch.zeros(1, requires_grad=True, device=self.args.device)
        self.alpha_optim_1 = torch.optim.Adam([self.log_alpha_1], lr=args.alpha_lr)
        self.target_entropy_2 = -torch.prod(torch.FloatTensor(single_action_space).to(self.args.device)).item()
        self.log_alpha_2 = torch.zeros(1, requires_grad=True, device=self.args.device)
        self.alpha_optim_2 = torch.optim.Adam([self.log_alpha_2], lr=args.alpha_lr)
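
# The action dimension above is halved because each of the two agents controls
# half of the composite action; a minimal sketch of recombining their outputs,
# assuming both half-actions are numpy arrays of equal length.
def compose_actions_sketch(action_1, action_2):
    # the environment consumes the concatenation of the two half-actions
    return np.concatenate([action_1, action_2])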