Example no. 1
    def __init__(self, args, env, env_params):
        self.args = args

        # path to save the model
        self.exp_name = '_'.join((self.args.env_name, self.args.alg, 
                    str(self.args.seed), datetime.now().isoformat()))
        self.data_path = os.path.join(self.args.save_dir, 
                '_'.join((self.args.env_name, self.args.alg)),
                self.exp_name)
        self.logger = EpochLogger(output_dir=self.data_path, exp_name=self.exp_name)
        self.logger.save_config(args)

        self.env = env
        self.env_params = env_params
        # create the network
        self.actor_network = actor(env_params)
        self.critic_network = critic(env_params)
        # sync the networks across the cpus
        sync_networks(self.actor_network)
        sync_networks(self.critic_network)
        # build up the target network
        self.actor_target_network = actor(env_params)
        self.critic_target_network = critic(env_params)
        # load the weights into the target networks
        self.actor_target_network.load_state_dict(self.actor_network.state_dict())
        self.critic_target_network.load_state_dict(self.critic_network.state_dict())

        # if use gpu
        self.rank = MPI.COMM_WORLD.Get_rank()
        if args.cuda:
            device = 'cuda:{}'.format(self.rank % torch.cuda.device_count())
        else:
            device = 'cpu'
        self.device = torch.device(device)

        if self.args.cuda:
            self.actor_network.cuda(self.device)
            self.critic_network.cuda(self.device)
            self.actor_target_network.cuda(self.device)
            self.critic_target_network.cuda(self.device)
        # create the optimizer
        self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
        self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
        # her sampler
        self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env.compute_reward)
        # create the replay buffer
        self.buffer = replay_buffer(self.env_params, self.args.buffer_size, self.her_module.sample_her_transitions)
        # create the normalizer
        self.o_norm = normalizer(size=env_params['obs'], default_clip_range=self.args.clip_range)
        self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)

        self.logger.setup_pytorch_saver(self.actor_network)
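
Every example in this listing relies on a sync_networks helper that is not shown. A minimal sketch, assuming mpi4py and NumPy, in which rank 0's weights are broadcast so that all MPI workers start from identical parameters (not the original implementation):

import numpy as np
import torch
from mpi4py import MPI

def sync_networks(network):
    comm = MPI.COMM_WORLD
    # flatten every parameter into a single vector on each rank
    flat_params = np.concatenate(
        [p.data.cpu().numpy().flatten() for p in network.parameters()])
    # broadcast rank 0's vector to all other ranks
    comm.Bcast(flat_params, root=0)
    # copy the broadcast values back into the local network
    offset = 0
    for p in network.parameters():
        numel = p.data.numel()
        values = flat_params[offset:offset + numel].reshape(p.data.shape)
        p.data.copy_(torch.as_tensor(values, dtype=p.data.dtype))
        offset += numel
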
Example no. 2
    def __init__(self, args, env, env_params):
        self.args = args
        self.env = env
        self.env_params = env_params
        # create the network
        self.actor_network = actor(env_params)
        self.critic_network = critic(env_params)
        # sync the networks across the cpus
        sync_networks(self.actor_network)
        sync_networks(self.critic_network)
        # build up the target network
        self.actor_target_network = actor(env_params)
        self.critic_target_network = critic(env_params)
        # load the weights into the target networks
        self.actor_target_network.load_state_dict(self.actor_network.state_dict())
        self.critic_target_network.load_state_dict(self.critic_network.state_dict())
        # if use gpu
        if self.args.cuda:
            self.actor_network.cuda()
            self.critic_network.cuda()
            self.actor_target_network.cuda()
            self.critic_target_network.cuda()
        # create the optimizer
        if self.args.optimizer_type == 'SGD':
            self.actor_optim = torch.optim.SGD(self.actor_network.parameters(), lr=self.args.lr_actor)
            self.critic_optim = torch.optim.SGD(self.critic_network.parameters(), lr=self.args.lr_critic)
        elif self.args.optimizer_type == 'adam':
            self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
            self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
        # her sampler
        self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env.compute_reward)
        # create the replay buffer
        self.buffer = replay_buffer(self.env_params, self.args.buffer_size, self.her_module.sample_her_transitions)
        # create the normalizer
        self.o_norm = normalizer(size=env_params['obs'], default_clip_range=self.args.clip_range)
        self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)
        self.scales = []

        # create the directory to store the model
        if MPI.COMM_WORLD.Get_rank() == 0:
            if not os.path.exists(self.args.save_dir):
                os.mkdir(self.args.save_dir)
            # path to save the model
            self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
            if not os.path.exists(self.model_path):
                os.mkdir(self.model_path)

            self.result_dir = f'./learning_curves/{args.env_name}/{self.args.run_name}'
            if not os.path.isdir(self.result_dir):
                os.makedirs(self.result_dir, exist_ok=True)
                print(f'creating {self.result_dir}')
            self.writer = SummaryWriter(logdir=self.result_dir)
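
The o_norm / g_norm objects constructed in each example come from a normalizer class that is not included here. A minimal, single-process sketch (the original presumably also aggregates statistics across MPI workers), assuming NumPy:

import numpy as np

class normalizer:
    def __init__(self, size, eps=1e-2, default_clip_range=np.inf):
        self.size = size
        self.eps = eps
        self.default_clip_range = default_clip_range
        # running sums used to estimate mean and std online
        self.sum = np.zeros(size, dtype=np.float32)
        self.sumsq = np.zeros(size, dtype=np.float32)
        self.count = 1e-4
        self.mean = np.zeros(size, dtype=np.float32)
        self.std = np.ones(size, dtype=np.float32)

    def update(self, v):
        v = v.reshape(-1, self.size)
        self.sum += v.sum(axis=0)
        self.sumsq += (v ** 2).sum(axis=0)
        self.count += v.shape[0]
        self.mean = self.sum / self.count
        self.std = np.sqrt(np.maximum(self.eps ** 2,
                                      self.sumsq / self.count - self.mean ** 2))

    def normalize(self, v, clip_range=None):
        clip = self.default_clip_range if clip_range is None else clip_range
        return np.clip((v - self.mean) / self.std, -clip, clip)
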
Example no. 3
    def __init__(self, args, env, env_params):
        self.args = args
        self.env = env
        self.env_params = env_params
        # create the network
        self.actor_network = actor(env_params)
        self.critic_network = critic(env_params)
        # sync the networks across the cpus
        sync_networks(self.actor_network)
        sync_networks(self.critic_network)
        # build up the target network
        self.actor_target_network = actor(env_params)
        self.critic_target_network = critic(env_params)
        # load the weights into the target networks
        self.actor_target_network.load_state_dict(
            self.actor_network.state_dict())
        self.critic_target_network.load_state_dict(
            self.critic_network.state_dict())
        # if use gpu
        if self.args.cuda:
            self.actor_network.cuda()
            self.critic_network.cuda()
            self.actor_target_network.cuda()
            self.critic_target_network.cuda()
        # create the optimizer
        self.actor_optim = torch.optim.Adam(self.actor_network.parameters(),
                                            lr=self.args.lr_actor)
        self.critic_optim = torch.optim.Adam(self.critic_network.parameters(),
                                             lr=self.args.lr_critic)
        # her sampler
        self.her_module = her_sampler(self.args.replay_strategy,
                                      self.args.replay_k,
                                      self.env.compute_reward)
        # create the replay buffer
        self.buffer = replay_buffer(self.env_params, self.args.buffer_size,
                                    self.her_module.sample_her_transitions)
        # create the normalizer
        self.o_norm = normalizer(size=env_params['obs'],
                                 default_clip_range=self.args.clip_range)
        self.g_norm = normalizer(size=env_params['goal'],
                                 default_clip_range=self.args.clip_range)
        # create the directory to store the model
        if MPI.COMM_WORLD.Get_rank() == 0:
            if not os.path.exists(self.args.save_dir):
                os.mkdir(self.args.save_dir)
            # path to save the model
            self.model_path = os.path.join(self.args.save_dir,
                                           self.args.env_name)
            if not os.path.exists(self.model_path):
                os.mkdir(self.model_path)
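
replay_buffer is referenced but never defined in these snippets. A compact sketch under the field names seen above ('obs', 'ag', 'g', 'actions') and an assumed env_params['max_timesteps'] entry, storing data episode by episode:

import threading
import numpy as np

class replay_buffer:
    def __init__(self, env_params, buffer_size, sample_func):
        self.env_params = env_params
        self.T = env_params['max_timesteps']
        self.size = buffer_size // self.T
        self.sample_func = sample_func
        self.current_size = 0
        # episode-major storage for observations, achieved goals, goals and actions
        self.buffers = {'obs': np.empty([self.size, self.T + 1, env_params['obs']]),
                        'ag': np.empty([self.size, self.T + 1, env_params['goal']]),
                        'g': np.empty([self.size, self.T, env_params['goal']]),
                        'actions': np.empty([self.size, self.T, env_params['action']])}
        self.lock = threading.Lock()

    def store_episode(self, episode_batch):
        mb_obs, mb_ag, mb_g, mb_actions = episode_batch
        batch_size = mb_obs.shape[0]
        with self.lock:
            idxs = self._get_storage_idx(batch_size)
            self.buffers['obs'][idxs] = mb_obs
            self.buffers['ag'][idxs] = mb_ag
            self.buffers['g'][idxs] = mb_g
            self.buffers['actions'][idxs] = mb_actions

    def sample(self, batch_size):
        with self.lock:
            temp = {key: self.buffers[key][:self.current_size] for key in self.buffers}
        temp['obs_next'] = temp['obs'][:, 1:, :]
        temp['ag_next'] = temp['ag'][:, 1:, :]
        return self.sample_func(temp, batch_size)

    def _get_storage_idx(self, inc):
        if self.current_size + inc <= self.size:
            idx = np.arange(self.current_size, self.current_size + inc)
        elif self.current_size < self.size:
            # fill the remaining slots, then overwrite random old episodes
            overflow = inc - (self.size - self.current_size)
            idx = np.concatenate([np.arange(self.current_size, self.size),
                                  np.random.randint(0, self.current_size, overflow)])
        else:
            idx = np.random.randint(0, self.size, inc)
        self.current_size = min(self.size, self.current_size + inc)
        return idx
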
Example no. 4
    def __init__(self, args, env, env_params):
        self.args = args
        self.env = env
        self.env_params = env_params
        # create the network
        self.actor_network = actor(args, env_params)
        self.critic_network = critic(args, env_params)
        # sync the networks across the cpus
        sync_networks(self.actor_network)
        sync_networks(self.critic_network)
        # build up the target network
        self.actor_target_network = actor(args, env_params)
        self.critic_target_network = critic(args, env_params)
        # load the weights into the target networks
        self.actor_target_network.load_state_dict(self.actor_network.state_dict())
        self.critic_target_network.load_state_dict(self.critic_network.state_dict())
        # if use gpu
        if self.args.cuda:
            self.actor_network.cuda()
            self.critic_network.cuda()
            self.actor_target_network.cuda()
            self.critic_target_network.cuda()
        # create the optimizer
        self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
        self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
        # her sampler
        self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env.compute_reward)
        # create the replay buffer
        self.buffer = replay_buffer(self.env_params, self.args.buffer_size, self.her_module.sample_her_transitions)
        # create the normalizer
        self.o_norm = normalizer(size=env_params['obs'], default_clip_range=self.args.clip_range)
        self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)
        # create the directory to store the model
        if MPI.COMM_WORLD.Get_rank() == 0:
            if not os.path.exists(self.args.save_dir):
                os.mkdir(self.args.save_dir)
            # path to save the model
            self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
            if not os.path.exists(self.model_path):
                os.mkdir(self.model_path)

            # added
            log_dir = create_env_folder(args.env_name, args.network_class, test=args.test)
            save_kwargs(vars(args), log_dir)
            tabular_log_path = osp.join(log_dir, 'progress.csv')
            text_log_path = osp.join(log_dir, 'debug.log')
            logger.add_text_output(text_log_path)
            logger.add_tabular_output(tabular_log_path)
            exp_name = f'{args.env_name}'
            logger.push_prefix("[%s] " % exp_name)
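
her_sampler is also external to these snippets. A sketch of the usual 'future' relabeling strategy, assuming the replay buffer supplies 'actions', 'ag', 'ag_next' and 'g' arrays of shape (episodes, T, dim); this is an illustration, not the original code:

import numpy as np

class her_sampler:
    def __init__(self, replay_strategy, replay_k, reward_func=None):
        self.replay_strategy = replay_strategy
        self.replay_k = replay_k
        # fraction of transitions whose goal gets replaced by a future achieved goal
        self.future_p = 1 - (1.0 / (1 + replay_k)) if replay_strategy == 'future' else 0
        self.reward_func = reward_func

    def sample_her_transitions(self, episode_batch, batch_size):
        T = episode_batch['actions'].shape[1]
        num_episodes = episode_batch['actions'].shape[0]
        # pick random (episode, timestep) pairs
        episode_idxs = np.random.randint(0, num_episodes, batch_size)
        t_samples = np.random.randint(T, size=batch_size)
        transitions = {key: episode_batch[key][episode_idxs, t_samples].copy()
                       for key in episode_batch.keys()}
        # choose which transitions get relabeled and sample a future timestep for them
        her_idxs = np.where(np.random.uniform(size=batch_size) < self.future_p)
        future_offset = (np.random.uniform(size=batch_size) * (T - t_samples)).astype(int)
        future_t = (t_samples + 1 + future_offset)[her_idxs]
        # replace the desired goal with an achieved goal observed later in the episode
        transitions['g'][her_idxs] = episode_batch['ag'][episode_idxs[her_idxs], future_t]
        # recompute the reward with respect to the (possibly relabeled) goal
        transitions['r'] = np.expand_dims(
            self.reward_func(transitions['ag_next'], transitions['g'], None), 1)
        return transitions
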
Example no. 5
    def __init__(self, args, env, env_params):
        self.args = args
        self.env = env
        self.env_params = env_params
        sim = self.env.sim
        self.viewer = MjRenderContextOffscreen(sim)
        # self.viewer.cam.fixedcamid = 3
        # self.viewer.cam.type = const.CAMERA_FIXED
        self.critic_loss = []
        self.actor_loss = []
        self.viewer.cam.distance = 1.2
        self.viewer.cam.azimuth = 180
        self.viewer.cam.elevation = -25
        env.env._viewers['rgb_array'] = self.viewer

        self.env_params = env_params
        self.image_based = bool(args.image)
        print("Training image-based RL?: {}".format(self.image_based))
        # create the network
        if not self.image_based:
            self.actor_network = actor(env_params)
        else:
            self.actor_network = new_actor(env_params)
            #self.actor_network = resnet_actor(env_params)
        self.critic_network = critic(env_params)

        # sync the networks across the cpus
        sync_networks(self.actor_network)
        sync_networks(self.critic_network)
        # build up the target network
        if not self.image_based:
            self.actor_target_network = actor(env_params)
        else:
            #self.actor_target_network = resnet_actor(env_params)
            self.actor_target_network = new_actor(env_params)

        self.critic_target_network = critic(env_params)
        # load the weights into the target networks
        self.actor_target_network.load_state_dict(
            self.actor_network.state_dict())
        self.critic_target_network.load_state_dict(
            self.critic_network.state_dict())
        # if use gpu
        if self.args.cuda:
            print("use the GPU")
            self.actor_network.cuda(MPI.COMM_WORLD.Get_rank())
            self.critic_network.cuda(MPI.COMM_WORLD.Get_rank())
            self.actor_target_network.cuda(MPI.COMM_WORLD.Get_rank())
            self.critic_target_network.cuda(MPI.COMM_WORLD.Get_rank())

        # create the optimizer
        self.actor_optim = torch.optim.Adam(self.actor_network.parameters(),
                                            lr=self.args.lr_actor)
        self.critic_optim = torch.optim.Adam(self.critic_network.parameters(),
                                             lr=self.args.lr_critic)
        # her sampler
        self.her_module = her_sampler(self.args.replay_strategy,
                                      self.args.replay_k,
                                      self.env.compute_reward,
                                      self.image_based)
        # create the replay buffer
        self.buffer = replay_buffer(self.env_params, self.args.buffer_size,
                                    self.her_module.sample_her_transitions,
                                    self.image_based)
        # create the normalizer
        self.o_norm = normalizer(size=env_params['obs'],
                                 default_clip_range=self.args.clip_range)
        self.g_norm = normalizer(size=env_params['goal'],
                                 default_clip_range=self.args.clip_range)
        # create the directory to store the model
        if MPI.COMM_WORLD.Get_rank() == 0:
            if not os.path.exists(self.args.save_dir):
                os.mkdir(self.args.save_dir)
            # path to save the model
            self.model_path = os.path.join(self.args.save_dir,
                                           self.args.env_name)
            if not os.path.exists(self.model_path):
                os.mkdir(self.model_path)
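
Each constructor above copies the online weights into the target networks once; during training the targets are then usually tracked with a soft (polyak) update. A minimal method sketch, where `polyak` is an assumed hyperparameter (e.g. self.args.polyak):

    def _soft_update_target_network(self, target, source, polyak):
        # move the target network slowly toward the online network
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_((1 - polyak) * param.data + polyak * target_param.data)
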
Example no. 6
    def __init__(self, args, env1, env2, env1_params, env2_params):
        self.args = args
        self.env1 = env1
        self.env2 = env2
        self.env1_params = env1_params
        self.env2_params = env2_params
        if not self.args.dont_inject_observation:
            self.env2_params['obs'] += 1
            self.env1_params['obs'] += 1

        self.train_mode = TrainMode(args.training_mode)

        # store weights and biases API key if in args
        if self.args.wandb_api_key is not None:
            os.environ["WANDB_API_KEY"] = self.args.wandb_api_key
        # if key is present set a flag to enable the functionality
        self.use_wandb_log = os.environ.get("WANDB_API_KEY") is not None

        # create the network
        assert env1_params == env2_params  # both envs must share the same parameters
        self.actor_network = actor(env1_params)

        self.critic_network = critic(env1_params)

        # sync the networks across the cpus
        sync_networks(self.actor_network)
        sync_networks(self.critic_network)

        # build up the target network
        self.actor_target_network = actor(env1_params)
        self.critic_target_network = critic(env1_params)

        # load the weights into the target networks
        self.actor_target_network.load_state_dict(self.actor_network.state_dict())
        self.critic_target_network.load_state_dict(self.critic_network.state_dict())

        # if use gpu
        if self.args.cuda:
            self.actor_network.cuda()
            self.critic_network.cuda()
            self.actor_target_network.cuda()
            self.critic_target_network.cuda()

        # create the optimizer
        self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
        self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
        
        # setup dual critic if applicable
        self.use_two_critics = self.args.dual_critic
        if self.use_two_critics:
            self.critic_network2 = critic(env1_params)
            sync_networks(self.critic_network2)
            self.critic_target_network2 = critic(env1_params)
            self.critic_target_network2.load_state_dict(self.critic_network2.state_dict())
            self.critic2_optim = torch.optim.Adam(self.critic_network2.parameters(), lr=self.args.lr_critic)
            
            if self.args.cuda:
                self.critic_network2.cuda()
                self.critic_target_network2.cuda()

        # her sampler
        self.her_module1 = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env1.compute_reward)
        self.her_module2 = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env2.compute_reward)

        # create the replay buffer
        self.buffer1 = replay_buffer(self.env1_params, self.args.buffer_size, self.her_module1.sample_her_transitions)
        self.buffer2 = replay_buffer(self.env2_params, self.args.buffer_size, self.her_module2.sample_her_transitions)

        # create the normalizer
        self.o_norm = normalizer(size=env1_params['obs'], default_clip_range=self.args.clip_range)
        self.g_norm = normalizer(size=env1_params['goal'], default_clip_range=self.args.clip_range)

        # create the directory to store the model
        if MPI.COMM_WORLD.Get_rank() == 0:
            if not os.path.exists(self.args.save_dir):
                os.mkdir(self.args.save_dir)

            # path to save the model
            self.model_path = os.path.join(self.args.save_dir, self.args.env1_name + self.args.env2_name)
            if not os.path.exists(self.model_path):
                os.mkdir(self.model_path)
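
The dual_critic branch in this example only builds the second critic and its optimizer. A sketch of how two critics are typically combined into a clipped double-Q target during the update (inputs_next_norm and r_tensor are assumed tensor names, and self.args.gamma is an assumed discount factor):

    def _clipped_double_q_target(self, inputs_next_norm, r_tensor):
        with torch.no_grad():
            actions_next = self.actor_target_network(inputs_next_norm)
            q1_next = self.critic_target_network(inputs_next_norm, actions_next)
            q2_next = self.critic_target_network2(inputs_next_norm, actions_next)
            # take the minimum of the two critics to reduce overestimation bias
            target_q = r_tensor + self.args.gamma * torch.min(q1_next, q2_next)
        return target_q
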
Example no. 7
    def __init__(self, args, env, env_params):
        self.args = args
        self.env = env
        self.env_params = env_params
        # check whether to continue training or start new
        if args.continue_training is None:
            self.continueTraining = False
        else:
            self.continueTraining = args.continue_training
        # create the network
        self.actor_network = actor(env_params)
        self.critic_network = critic(env_params)
        # sync the networks across the cpus
        sync_networks(self.actor_network)
        sync_networks(self.critic_network)

        # build up the target network
        self.actor_target_network = actor(env_params)
        self.critic_target_network = critic(env_params)

        dash = "-" * 42
        if MPI.COMM_WORLD.Get_rank() == 0:
            print("env.spec.id: ", env.spec.id)
            print("args: ")
            d_args = vars(args)
            print(dash)
            print("{:<25s}{:<15s}".format("ARGS", "VALUE"))
            for key in d_args:
                if d_args[key] is not None:
                    print("|{:<22s} | {:<15}|".format(key, d_args[key]))
            print(dash)
            print("env_inits: ")
            print("{:<25s}{:<15s}".format("ENV_INIT", "VALUE"))
            for key in env.env.inits:
                print("|{:<22s} | {:<15}|".format(key, env.env.inits[key]))
            print(dash)
            print("env_dimensions: ")
            for key in env_params:
                print("|{:<22s} | {:<15}|".format(key, env_params[key]))
            print(dash)

            #print("env_params", env_params)
        if self.continueTraining:
            if MPI.COMM_WORLD.Get_rank() == 0:
                print("CONTINUE TRAINING...")
            env_name = env.spec.id
            saved_dicts = load_saved_state_dicts(
                args.save_dir, env_name, MPI.COMM_WORLD.Get_rank())
            self.actor_network.load_state_dict(saved_dicts['actor'])
            self.critic_network.load_state_dict(saved_dicts['critic'])

            self.critic_target_network.load_state_dict(
                saved_dicts['critic_target'])
            self.actor_target_network.load_state_dict(
                saved_dicts['actor_target'])
        else:

            # load the weights into the target networks
            self.actor_target_network.load_state_dict(
                self.actor_network.state_dict())
            self.critic_target_network.load_state_dict(
                self.critic_network.state_dict())

        # if use gpu
        if self.args.cuda:
            self.actor_network.cuda()
            self.critic_network.cuda()
            self.actor_target_network.cuda()
            self.critic_target_network.cuda()
        # create the optimizer
        self.actor_optim = torch.optim.Adam(
            self.actor_network.parameters(), lr=self.args.lr_actor)
        self.critic_optim = torch.optim.Adam(
            self.critic_network.parameters(), lr=self.args.lr_critic)
        # her sampler
        self.her_module = her_sampler(
            self.args.replay_strategy, self.args.replay_k, self.env.compute_reward)
        # create the replay buffer
        self.buffer = replay_buffer(
            self.env_params, self.args.buffer_size, self.her_module.sample_her_transitions)
        # create the normalizer
        self.o_norm = normalizer(
            size=env_params['obs'], default_clip_range=self.args.clip_range)
        self.g_norm = normalizer(
            size=env_params['goal'], default_clip_range=self.args.clip_range)
        # create the directory to store the model
        if MPI.COMM_WORLD.Get_rank() == 0:
            if not os.path.exists(self.args.save_dir):
                os.mkdir(self.args.save_dir)
            # path to save the model
            self.model_path = os.path.join(
                self.args.save_dir, self.args.env_name)
            if not os.path.exists(self.model_path):
                os.mkdir(self.model_path)
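
This example resumes training through a load_saved_state_dicts helper that is not part of the snippet. A hypothetical sketch (the checkpoint file name and key layout are assumptions, not the original code):

import os
import torch

def load_saved_state_dicts(save_dir, env_name, rank):
    # hypothetical checkpoint layout: one file holding all four state dicts
    path = os.path.join(save_dir, env_name, 'checkpoint.pt')
    if rank == 0:
        print('loading checkpoint from', path)
    checkpoint = torch.load(path, map_location='cpu')
    # expected keys: 'actor', 'critic', 'actor_target', 'critic_target'
    return checkpoint
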
Example no. 8
    def __init__(self, args, env, env_params):
        self.args = args

        # path to save the model
        self.exp_name = '_'.join(
            (self.args.env_name, self.args.alg, str(self.args.seed),
             datetime.now().isoformat()))
        self.data_path = os.path.join(
            self.args.save_dir, '_'.join((self.args.env_name, self.args.alg)),
            self.exp_name)
        self.logger = EpochLogger(output_dir=self.data_path,
                                  exp_name=self.exp_name)
        self.logger.save_config(args)

        self.env = env
        self.env_params = env_params
        # create the network
        self.actor_network = actor(env_params)
        self.critic_network1 = critic(env_params)
        self.critic_network2 = critic(env_params)
        # sync the networks across the cpus
        sync_networks(self.actor_network)
        sync_networks(self.critic_network1)
        sync_networks(self.critic_network2)
        # build up the target network
        # self.actor_target_network = actor(env_params)
        self.critic_target_network1 = critic(env_params)
        self.critic_target_network2 = critic(env_params)
        # load the weights into the target networks
        # self.actor_target_network.load_state_dict(self.actor_network.state_dict())
        self.critic_target_network1.load_state_dict(
            self.critic_network1.state_dict())
        self.critic_target_network2.load_state_dict(
            self.critic_network2.state_dict())

        # if use gpu
        self.rank = MPI.COMM_WORLD.Get_rank()
        if args.cuda:
            device = 'cuda:{}'.format(self.rank % torch.cuda.device_count())
        else:
            device = 'cpu'
        self.device = torch.device(device)

        if self.args.cuda:
            self.actor_network.cuda(self.device)
            self.critic_network1.cuda(self.device)
            self.critic_network2.cuda(self.device)
            # self.actor_target_network.cuda(self.device)
            self.critic_target_network1.cuda(self.device)
            self.critic_target_network2.cuda(self.device)
        # create the optimizer
        self.actor_optim = torch.optim.Adam(self.actor_network.parameters(),
                                            lr=self.args.lr_actor)
        self.critic_optim1 = torch.optim.Adam(
            self.critic_network1.parameters(), lr=self.args.lr_critic)
        self.critic_optim2 = torch.optim.Adam(
            self.critic_network2.parameters(), lr=self.args.lr_critic)
        # her sampler
        self.her_module = her_sampler(self.args.replay_strategy,
                                      self.args.replay_k,
                                      self.env.compute_reward)
        # create the replay buffer
        self.buffer = replay_buffer(self.env_params, self.args.buffer_size,
                                    self.her_module.sample_her_transitions)
        # create the normalizer
        self.o_norm = normalizer(size=env_params['obs'],
                                 default_clip_range=self.args.clip_range)
        self.g_norm = normalizer(size=env_params['goal'],
                                 default_clip_range=self.args.clip_range)

        self.logger.setup_pytorch_saver(self.actor_network)

        # auto temperature
        if self.args.alpha < 0.0:
            # if self.args.alpha < 0.0, SAC uses automatic temperature tuning
            # and initializes alpha = -self.args.alpha
            self.alpha = -self.args.alpha
            self.log_alpha = torch.tensor(np.log(self.alpha),
                                          dtype=torch.float32,
                                          device=device,
                                          requires_grad=True)
            self.target_entropy = -np.prod(env.action_space.shape).astype(
                np.float32)
            self.target_entropy = self.target_entropy / 2.0
            self.alpha_optim = torch.optim.Adam([self.log_alpha],
                                                lr=self.args.lr_actor)
        else:
            self.alpha = self.args.alpha
        self.alpha = torch.tensor(self.alpha)
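
The auto-temperature block above only sets up log_alpha, target_entropy and alpha_optim; the corresponding update step during training usually looks roughly like the following sketch (log_prob from the current policy is an assumed input):

    def _update_alpha(self, log_prob):
        # standard SAC temperature loss: push the policy entropy toward target_entropy
        alpha_loss = -(self.log_alpha * (log_prob + self.target_entropy).detach()).mean()
        self.alpha_optim.zero_grad()
        alpha_loss.backward()
        self.alpha_optim.step()
        self.alpha = self.log_alpha.exp().detach()
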
Example no. 9
    def __init__(self, args, env, env_params, writer=None):
        if args.cuda:
            torch.cuda.set_device(args.device)
        self.args = args
        self.env = env
        env_params['action'] = env_params['action'] // 2
        env_params['obs'] = 37  # original 56
        self.env_params = env_params
        # create the network
        self.actor_network_1 = actor_prob(env_params)
        self.critic_network_1 = critic(env_params)
        # sync the networks across the cpus
        sync_networks(self.actor_network_1)
        sync_networks(self.critic_network_1)
        # build up the target network
        self.actor_target_network_1 = actor_prob(env_params)
        self.critic_target_network_1 = critic(env_params)
        # load the weights into the target networks
        self.actor_target_network_1.load_state_dict(
            self.actor_network_1.state_dict())
        self.critic_target_network_1.load_state_dict(
            self.critic_network_1.state_dict())

        # create the network
        self.actor_network_2 = actor_prob(env_params)
        self.critic_network_2 = critic(env_params)
        # sync the networks across the cpus
        sync_networks(self.actor_network_2)
        sync_networks(self.critic_network_2)
        # build up the target network
        self.actor_target_network_2 = actor_prob(env_params)
        self.critic_target_network_2 = critic(env_params)
        # load the weights into the target networks
        self.actor_target_network_2.load_state_dict(
            self.actor_network_2.state_dict())
        self.critic_target_network_2.load_state_dict(
            self.critic_network_2.state_dict())

        # if use gpu
        if self.args.cuda:
            self.actor_network_1.to(args.device)
            self.critic_network_1.to(args.device)
            self.actor_target_network_1.to(args.device)
            self.critic_target_network_1.to(args.device)
            self.actor_network_2.to(args.device)
            self.critic_network_2.to(args.device)
            self.actor_target_network_2.to(args.device)
            self.critic_target_network_2.to(args.device)
        # create the optimizer
        self.actor_optim_1 = torch.optim.Adam(
            self.actor_network_1.parameters(), lr=self.args.lr_actor)
        self.critic_optim_1 = torch.optim.Adam(
            self.critic_network_1.parameters(), lr=self.args.lr_critic)
        self.actor_optim_2 = torch.optim.Adam(
            self.actor_network_2.parameters(), lr=self.args.lr_actor_2)
        self.critic_optim_2 = torch.optim.Adam(
            self.critic_network_2.parameters(), lr=self.args.lr_critic_2)
        # her sampler
        if 'Stack' in self.args.env_name or 'Lift' in self.args.env_name:
            self.her_module_1 = her_sampler(self.args.replay_strategy,
                                            self.args.replay_k,
                                            self.env.compute_reward,
                                            stack=True)
            self.her_module_2 = her_sampler(self.args.replay_strategy,
                                            self.args.replay_k,
                                            self.env.compute_reward,
                                            stack=True)
        else:
            self.her_module_1 = her_sampler(self.args.replay_strategy,
                                            self.args.replay_k,
                                            self.env.compute_reward)
            self.her_module_2 = her_sampler(self.args.replay_strategy,
                                            self.args.replay_k,
                                            self.env.compute_reward)

        # if args.only_for_door:
        #     self.her_module_2 = her_sampler(self.args.replay_strategy, self.args.replay_k,
        #                                     self.env.compute_reward_only_for_door, double=True)
        # else:
        #     self.her_module_2 = her_sampler(self.args.replay_strategy, self.args.replay_k,
        #                                     self.env.compute_reward_for_door, double=True)

        # create the replay buffer
        self.buffer_1 = replay_buffer(self.env_params, self.args.buffer_size,
                                      self.her_module_1.sample_her_transitions)
        self.buffer_2 = replay_buffer(self.env_params, self.args.buffer_size,
                                      self.her_module_2.sample_her_transitions)
        # self.buffer_2 = replay_buffer(self.env_params, self.args.buffer_size, self.her_module_2.sample_her_transitions,
        #                               double=True)

        # create the normalizer
        self.o_norm_1 = normalizer(
            size=37, default_clip_range=self.args.clip_range)  # original 56
        self.o_norm_2 = normalizer(
            size=37, default_clip_range=self.args.clip_range)  # original 56
        self.g_norm = normalizer(size=env_params['goal'],
                                 default_clip_range=self.args.clip_range)

        # for visualization
        self.g_norm_1 = normalizer(size=env_params['goal'],
                                   default_clip_range=self.args.clip_range)
        self.g_norm_2 = normalizer(size=env_params['goal'],
                                   default_clip_range=self.args.clip_range)

        self.explore = None
        if args.count_exp:
            self.explore = HashingBonusEvaluator(
                obs_processed_flat_dim=37, beta=args.exp_beta)  # original 56
        # create the directory to store the model
        if MPI.COMM_WORLD.Get_rank() == 0:
            if not os.path.exists(self.args.save_dir):
                os.mkdir(self.args.save_dir)
            # path to save the model
            self.model_path = os.path.join(self.args.save_dir,
                                           self.args.env_name)
            if not os.path.exists(self.model_path):
                os.mkdir(self.model_path)
            if args.count_exp:
                self.model_path = os.path.join(self.model_path,
                                               'count-exploration')
            if not os.path.exists(self.model_path):
                os.mkdir(self.model_path)
            # flag = 'actor_lr_' + str(self.args.lr_actor) + '_critic_lr_' + str(
            #     self.args.lr_critic) + '_actor_lr_2_' + str(self.args.lr_actor_2) + '_critic_lr_2_' + str(
            #     self.args.lr_critic_2)
            # flag += '_shaped_reward'
            # if args.only_for_door:
            #     flag += '_only_for_door'
            # else:
            #     flag += '_not_only_for_door'
            flag = str(self.args.env_name)
            self.model_path = os.path.join(self.model_path, flag)
            if not os.path.exists(self.model_path):
                os.mkdir(self.model_path)

            if not os.path.exists(self.args.save_training_success_rate_dir):
                os.mkdir(self.args.save_training_success_rate_dir)
            self.training_success_rate_path = os.path.join(
                self.args.save_training_success_rate_dir, self.args.env_name)
            if not os.path.exists(self.training_success_rate_path):
                os.mkdir(self.training_success_rate_path)
            self.training_success_rate_path = os.path.join(
                self.training_success_rate_path, flag)
            if not os.path.exists(self.training_success_rate_path):
                os.mkdir(self.training_success_rate_path)

            if not os.path.exists(self.args.save_training_return_dir):
                os.mkdir(self.args.save_training_return_dir)
            self.training_return_path = os.path.join(
                self.args.save_training_return_dir, self.args.env_name)
            if not os.path.exists(self.training_return_path):
                os.mkdir(self.training_return_path)
            self.training_return_path = os.path.join(self.training_return_path,
                                                     flag)
            if not os.path.exists(self.training_return_path):
                os.mkdir(self.training_return_path)

        self.writer = writer
        # for sac, one more critic for each
        self.critic_network_1_2 = critic(env_params)
        self.critic_network_2_2 = critic(env_params)
        sync_networks(self.critic_network_1_2)
        sync_networks(self.critic_network_2_2)
        self.critic_target_network_1_2 = critic(env_params)
        self.critic_target_network_2_2 = critic(env_params)
        # load the weights into the target networks
        self.critic_target_network_1_2.load_state_dict(
            self.critic_network_1_2.state_dict())
        self.critic_target_network_2_2.load_state_dict(
            self.critic_network_2_2.state_dict())
        # if use gpu
        if self.args.cuda:
            self.critic_network_1_2.to(args.device)
            self.critic_target_network_1_2.to(args.device)
            self.critic_network_2_2.to(args.device)
            self.critic_target_network_2_2.to(args.device)
        # create the optimizer
        self.critic_optim_1_2 = torch.optim.Adam(
            self.critic_network_1_2.parameters(), lr=self.args.lr_critic)
        self.critic_optim_2_2 = torch.optim.Adam(
            self.critic_network_2_2.parameters(), lr=self.args.lr_critic_2)
        self.alpha_1 = args.alpha  # 0.2
        self.alpha_2 = args.alpha  # 0.2
        single_action_space = (env.action_space.shape[0] // 2, )
        if self.args.cuda:
            self.target_entropy_1 = -torch.prod(
                torch.FloatTensor(single_action_space).to(
                    self.args.device)).item()
            self.log_alpha_1 = torch.zeros(1,
                                           requires_grad=True,
                                           device=self.args.device)
            # both agents share the same alpha_lr for now
            self.alpha_optim_1 = torch.optim.Adam([self.log_alpha_1],
                                                  lr=args.alpha_lr)
            self.target_entropy_2 = -torch.prod(
                torch.FloatTensor(single_action_space).to(
                    self.args.device)).item()
            self.log_alpha_2 = torch.zeros(1,
                                           requires_grad=True,
                                           device=self.args.device)
            self.alpha_optim_2 = torch.optim.Adam([self.log_alpha_2],
                                                  lr=args.alpha_lr)
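
The count_exp branch of this example uses a HashingBonusEvaluator that is not shown. A SimHash-style sketch with the constructor signature used above; the method names fit_before_process_samples / predict and the internal scheme are assumptions:

import numpy as np

class HashingBonusEvaluator:
    def __init__(self, obs_processed_flat_dim, dim_key=128, beta=1.0):
        # random projection used to hash continuous observations into binary codes
        self.projection = np.random.normal(size=(obs_processed_flat_dim, dim_key))
        self.counts = {}
        self.beta = beta

    def _hash(self, obs):
        codes = np.sign(np.atleast_2d(obs) @ self.projection).astype(np.int8)
        return [code.tobytes() for code in codes]

    def fit_before_process_samples(self, obs):
        # increment visit counts for every observation in the batch
        for key in self._hash(obs):
            self.counts[key] = self.counts.get(key, 0) + 1

    def predict(self, obs):
        # exploration bonus beta / sqrt(n(s)) per observation
        counts = np.array([self.counts.get(key, 0) for key in self._hash(obs)])
        return self.beta / np.sqrt(np.maximum(counts, 1))
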