class fit_dist(object):
    def __init__(self, num_inputs, action_space, args):
        self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                     args.hidden_size, action_space).to(device)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

    def train(self, memory, batch_size):
        # Sample replay buffer / batch
        # state_np, action_np, reward_np, next_state_np, mask_np = memory.sample(batch_size=batch_size)
        state_np, next_state_np, action_np, reward_np, mask_np = memory.sample(batch_size=batch_size)
        state_batch = torch.FloatTensor(state_np).to(device)
        action_batch = torch.FloatTensor(action_np).to(device)

        # Maximum-likelihood fit: minimize the negative log-probability of the buffer actions.
        log_prob = self.policy.log_prob(state_batch, action_batch)
        loss = -log_prob.mean()

        self.policy_optim.zero_grad()
        loss.backward()
        self.policy_optim.step()
        return loss.item()

    # Save model parameters
    def save_model(self, buffer_type, env_name, suffix="", actor_path=None):
        if not os.path.exists('models/'):
            os.makedirs('models/')
        if actor_path is None:
            actor_path = "models/{}_SAC_prior_{}_{}".format(buffer_type, env_name, suffix)
        print('Saving models to {}'.format(actor_path))
        torch.save(self.policy.state_dict(), actor_path)

def __init__(self, num_inputs, action_space, args):
    self.gamma = args.gamma
    self.tau = args.tau
    self.device = torch.device("cuda" if args.cuda else "cpu")

    self.critic = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(self.device)
    self.critic_optim = Adam(self.critic.parameters(), weight_decay=1e-2)
    self.critic_target = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(self.device)
    hard_update(self.critic_target, self.critic)

    self.policy = GaussianPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(self.device)
    self.policy_optim = Adam(self.policy.parameters(), lr=1e-4)
    self.policy_target = GaussianPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(self.device)
    hard_update(self.policy_target, self.policy)

    self.dual_lambda = args.init_dual_lambda
    self.dual_step_size = args.dual_step_size
    self.cost_epsilon = args.cost_epsilon
    self.coefficient_weight = args.coefficient_weight
    self.dual_steps = args.dual_steps
    self.dirac_policy_num = args.dirac_policy_num
    self.m = args.m
    self.n = args.n
    self.mmd_before_tanh = args.mmd_before_tanh

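# NOTE (added sketch, not taken from any snippet in this listing): most of these constructors
# call hard_update(target, source) right after building a target network, and the corresponding
# training code typically uses a Polyak-averaged soft_update(target, source, tau). Assuming the
# conventional helper signatures used across SAC-style repos, minimal implementations look like:
def hard_update(target, source):
    # Copy the source parameters into the target network verbatim.
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)

def soft_update(target, source, tau):
    # Polyak averaging: target <- (1 - tau) * target + tau * source.
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)
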
def __init__(self, num_inputs, action_space, args):
    self.gamma = args.gamma
    self.tau = args.tau

    self.critic = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(device)
    self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)
    self.critic_target = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(device)
    hard_update(self.critic_target, self.critic)

    self.policy = GaussianPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(device)
    self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
    self.policy_target = GaussianPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(device)
    hard_update(self.policy_target, self.policy)

    # dual_lambda
    self.dual_lambda = args.init_dual_lambda
    self.dual_step_size = args.dual_step_size
    self.cost_epsilon = args.cost_epsilon
    # coefficient_weight assigned to the ensemble variance term
    self.coefficient_weight = args.coefficient_weight
    self.dual_grad_times = args.dual_grad_times

def __init__(self, args, env_params):
    self.o_dim = env_params['o_dim']
    self.a_dim = env_params['a_dim']
    self.g_dim = env_params['g_dim']
    self.action_boundary = env_params['action_boundary']
    self.max_episode_steps = env_params['max_episode_steps']

    self.evaluate_episodes = args.evaluate_episodes
    self.lr_pi = args.lr_pi
    self.lr_v = args.lr_v
    self.gamma = args.gamma
    self.lamda = args.lamda
    self.action_var = args.action_var
    self.clip_range = args.clip_range
    self.temperature_coef = args.temperature_coef
    self.K_updates = args.K_updates
    self.device = torch.device(args.device)
    self.load_model_remark = args.load_model_remark

    self.total_trained_goal_num = 0
    self.total_episode_num = 0
    self.total_update_num = 0

    self.buffer = TrajectoryBuffer(self.max_episode_steps, self.o_dim, self.g_dim, self.a_dim)

    self.policy = GaussianPolicy(self.o_dim, self.g_dim, self.a_dim, self.action_var, self.device).to(self.device)
    self.policy_old = GaussianPolicy(self.o_dim, self.g_dim, self.a_dim, self.action_var, self.device).to(self.device)
    self.V = VFunction(self.o_dim, self.g_dim).to(self.device)
    self.V_old = VFunction(self.o_dim, self.g_dim).to(self.device)

    self.optimizer_pi = optim.Adam(self.policy.parameters(), lr=self.lr_pi)
    self.optimizer_v = optim.Adam(self.V.parameters(), lr=self.lr_v)

    self.hard_update()

def __init__(self):
    self.gamma = 0.99
    self.tau = 0.005
    self.alpha = 0.2
    self.lr = 0.003
    self.target_update_interval = 1
    self.device = torch.device("cpu")

    # 8 phases
    self.num_inputs = 8
    self.num_actions = 1
    self.hidden_size = 256

    self.critic = QNetwork(self.num_inputs, self.num_actions, self.hidden_size).to(self.device)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=self.lr)
    self.critic_target = QNetwork(self.num_inputs, self.num_actions, self.hidden_size).to(self.device)
    hard_update(self.critic_target, self.critic)  # copy the parameters of critic to critic_target

    self.target_entropy = -torch.Tensor([1.0]).to(self.device).item()
    self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
    self.alpha_optimizer = Adam([self.log_alpha], lr=self.lr)

    self.policy = GaussianPolicy(self.num_inputs, self.num_actions, self.hidden_size).to(self.device)
    self.policy_optimizer = Adam(self.policy.parameters(), lr=self.lr)

def __init__(self, num_inputs, action_space, args):
    self.device = torch.device("cpu")
    self.policy = GaussianPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(self.device)
    self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
    self.genbuffer_algo = args.genbuffer_algo

def __init__(self, args, env_params):
    self.o_dim = env_params['o_dim']
    self.a_dim = env_params['a_dim']
    self.action_scale = np.array(env_params['action_scale'], dtype=np.float32)
    self.action_bias = np.array(env_params['action_bias'], dtype=np.float32)
    self.action_boundary = env_params['action_boundary']

    self.device = torch.device(args.device)
    self.lr = args.lr
    self.gamma = args.gamma
    self.tau = args.tau
    self.alpha = args.alpha
    self.memory_size = args.memory_size
    self.batch_size = args.batch_size
    self.target_update_interval = args.target_update_interval

    self.memory = Memory(self.memory_size, self.o_dim, self.a_dim)

    self.policy = GaussianPolicy(self.o_dim, self.a_dim).to(self.device)
    self.critic = TwinQFunction(self.o_dim, self.a_dim).to(self.device)
    self.critic_target = TwinQFunction(self.o_dim, self.a_dim).to(self.device)

    # Target entropy = -dim(A). a_dim is an int, so it must be wrapped in a list;
    # torch.Tensor(self.a_dim) would allocate an uninitialized tensor of length a_dim instead.
    self.target_entropy = -torch.prod(torch.Tensor([self.a_dim]).to(self.device)).item()
    self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)

    self.optimizer_pi = optim.Adam(self.policy.parameters(), lr=self.lr)
    self.optimizer_q = optim.Adam(self.critic.parameters(), lr=self.lr)
    self.optimizer_alpha = optim.Adam([self.log_alpha], lr=self.lr)

    self.hard_update_target()

def __init__(self, num_inputs, action_space, args):
    self.gamma = args.gamma
    self.tau = args.tau
    self.alpha = args.alpha
    self.target_update_interval = args.target_update_interval
    self.automatic_entropy_tuning = args.automatic_entropy_tuning
    self.device = torch.device("cuda" if args.cuda else "cpu")

    self.critic = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(device=self.device)
    self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)
    self.critic_target = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(self.device)
    hard_update(self.critic_target, self.critic)

    # Target Entropy = -dim(A) (e.g. -6 for HalfCheetah-v2), as given in the paper.
    if self.automatic_entropy_tuning is True:
        self.target_entropy = -torch.prod(torch.Tensor(action_space.shape).to(self.device)).item()
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

    self.policy = GaussianPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(self.device)
    self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

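# NOTE (added sketch, not taken from the snippet above): the target_entropy / log_alpha /
# alpha_optim fields built here are normally consumed by SAC's automatic temperature update
# inside the agent's update step. Assuming log_pi holds the log-probabilities of actions freshly
# sampled from self.policy, the standard update looks like the hypothetical helper below:
def _update_temperature(self, log_pi):
    # Push log_alpha so that the policy entropy tracks target_entropy.
    alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()
    self.alpha_optim.zero_grad()
    alpha_loss.backward()
    self.alpha_optim.step()
    self.alpha = self.log_alpha.exp().item()
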
def __init__(self, num_inputs, action_space, args):
    self.num_inputs = num_inputs
    self.action_space = action_space.shape[0]
    self.gamma = args.gamma
    self.tau = args.tau
    self.scale_R = args.scale_R
    self.reparam = args.reparam
    self.deterministic = args.deterministic
    self.target_update_interval = args.target_update_interval

    self.policy = GaussianPolicy(self.num_inputs, self.action_space, args.hidden_size)
    self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

    self.critic = QNetwork(self.num_inputs, self.action_space, args.hidden_size)
    self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

    if self.deterministic == False:
        self.value = ValueNetwork(self.num_inputs, args.hidden_size)
        self.value_target = ValueNetwork(self.num_inputs, args.hidden_size)
        self.value_optim = Adam(self.value.parameters(), lr=args.lr)
        hard_update(self.value_target, self.value)
        self.value_criterion = nn.MSELoss()
    else:
        self.critic_target = QNetwork(self.num_inputs, self.action_space, args.hidden_size)
        hard_update(self.critic_target, self.critic)

    self.soft_q_criterion = nn.MSELoss()

def __init__(self, num_inputs, action_space,
             device, hidden_size, seed, lr, gamma, tau, alpha):
    self.gamma = gamma
    self.tau = tau
    self.alpha = alpha
    self.device = device

    self.seed = seed
    self.seed = torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # torch.cuda.manual_seed_all(seed)
    # torch.backends.cudnn.deterministic = True

    self.critic = QNetwork(seed, num_inputs, action_space.shape[0], hidden_size).to(device=self.device)
    self.critic_optim = Adam(self.critic.parameters(), lr=lr)
    self.critic_target = QNetwork(seed, num_inputs, action_space.shape[0], hidden_size).to(self.device)
    hard_update(self.critic_target, self.critic)

    # Target Entropy = -dim(A) (e.g. -6 for HalfCheetah-v2), as given in the paper.
    self.target_entropy = -torch.prod(torch.Tensor(action_space.shape).to(self.device)).item()
    self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
    self.alpha_optim = Adam([self.log_alpha], lr=lr)

    self.policy = GaussianPolicy(seed, num_inputs, action_space.shape[0],
                                 hidden_size, action_space).to(self.device)
    self.policy_optim = Adam(self.policy.parameters(), lr=lr)

def __init__(self):
    # Creating environment
    self.env = gym.make(settings.env_name)
    self.env.seed(settings.seed)
    self.env.action_space.seed(settings.seed)
    self.state_space = self.env.observation_space.shape[0]
    self.action_space = self.env.action_space.shape[0]
    self.obs_normalizer = Normalizer(self.state_space)
    self.device = torch.device(settings.device)
    self.writer = SummaryWriter(
        'runs/' + settings.env_name + "_" + settings.algo +
        '_{}_{}_{}'.format(p.alpha, p.ex_alpha, settings.seed))

    # Initializing common networks and their optimizers
    self.exploitory_policy = GaussianPolicy(self.state_space, self.action_space).to(self.device)
    self.exploitory_Q = QNet(self.state_space, self.action_space).to(self.device)
    self.exploitory_Q_target = QNet(self.state_space, self.action_space).to(self.device)
    self.exploitory_policy_optim = Adam(self.exploitory_policy.parameters(), lr=p.lr)
    self.exploitory_Q_optim = Adam(self.exploitory_Q.parameters(), lr=p.lr)
    self.target_update(self.exploitory_Q_target, self.exploitory_Q, 1.0)

    p.alpha = torch.Tensor([p.alpha]).to(self.device)
    if settings.automatic_entropy_tuning:
        self.target_entropy = -torch.prod(
            torch.Tensor(self.env.action_space.shape).to(self.device)).item()
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha_optim = Adam([self.log_alpha], lr=p.lr)

    if settings.automatic_ex_entropy_tuning:
        self.ex_target_entropy = -torch.prod(
            torch.Tensor(self.env.action_space.shape).to(self.device)).item()
        self.ex_log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        # Optimize the exploratory temperature's own parameter (the original passed
        # self.log_alpha here, which looks like a copy-paste slip).
        self.ex_alpha_optim = Adam([self.ex_log_alpha], lr=p.lr)

    if settings.reward_model == 'novelty':
        self.ex_reward_model = Novelty(self.state_space, self.device)

def policy_factory(model_path, env):
    agent = GaussianPolicy(env.observation_space.shape[0], env.action_space.shape[0],
                           256, env.action_space).to('cpu')
    agent.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))

    def policy(obs):
        state = torch.FloatTensor(obs).unsqueeze(0)
        _, _, action = agent.sample(state)
        return action.detach().cpu().numpy()[0]

    return policy

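# NOTE (added usage sketch, not from the snippet above; the checkpoint path and env id are
# illustrative): policy_factory returns a plain obs -> action callable, so a rollout is just:
env = gym.make('HalfCheetah-v2')
policy = policy_factory('models/policy.pth', env)
obs, done, episode_reward = env.reset(), False, 0.0
while not done:
    obs, reward, done, _ = env.step(policy(obs))
    episode_reward += reward
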
def main():
    # create config
    config = Config()
    config.game = args.game
    config.algo = 'ppo'
    config.max_steps = int(2e6)
    config.num_envs = 1
    config.optimizer = 'RMSprop'
    config.lr = 0.0003
    config.discount = 0.99
    config.use_gae = True
    config.gae_lambda = 0.95
    config.use_grad_clip = True
    config.max_grad_norm = 0.5
    config.rollout_length = 2048
    config.value_loss_coef = 0.5
    config.entropy_coef = 0
    config.ppo_epoch = 10
    config.ppo_clip_param = 0.2
    config.num_mini_batch = 32
    config.use_gpu = True
    config.seed = args.seed
    config.num_frame_stack = 1
    config.after_set()
    print(config)

    # prepare env, model and logger
    env = make_vec_envs(config.game, num_envs=config.num_envs, seed=config.seed,
                        num_frame_stack=config.num_frame_stack)
    model = GaussianPolicy(env.observation_space.shape[0],
                           action_dim=get_action_dim(env.action_space)).to(config.device)
    logger = Logger(SummaryWriter(config.save_path), config.num_echo_episodes)

    # create agent and run
    agent = PPOAgent(config, env, model, logger)
    agent.run()

def __init__(self, state_shape, n_actions, args):
    self.gamma = args.gamma
    self.tau = args.tau
    self.alpha = args.alpha
    self.action_range = [0.0, 1.0]
    self.policy_type = args.policy
    self.target_update_interval = args.target_update_interval
    self.automatic_entropy_tuning = args.automatic_entropy_tuning
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    self.critic = QNetwork(state_shape, n_actions, args.hidden_size).to(device=self.device)
    self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)
    self.critic_target = QNetwork(state_shape, n_actions, args.hidden_size).to(self.device)
    hard_update(self.critic_target, self.critic)

    if self.policy_type == "Gaussian":
        # Target Entropy = -dim(A) (e.g. -6 for HalfCheetah-v2), as given in the paper.
        if self.automatic_entropy_tuning:
            # n_actions is an int, so wrap it in a list; torch.Tensor(n_actions) would
            # allocate an uninitialized tensor of length n_actions instead.
            self.target_entropy = -torch.prod(torch.Tensor([n_actions]).to(self.device)).item()
            self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
            self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

        self.policy = GaussianPolicy(state_shape, n_actions, args.hidden_size).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
    else:
        self.alpha = 0
        self.automatic_entropy_tuning = False
        self.policy = DeterministicPolicy(state_shape, n_actions, args.hidden_size).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

def run():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_id', type=str, default='AntBulletEnv-v0')
    parser.add_argument('--log_name', type=str, default='')
    parser.add_argument('--cuda', action='store_true')
    parser.add_argument('--seed', type=int, default=0)
    args = parser.parse_args()

    if args.log_name:
        log_dir = os.path.join('logs', args.env_id, args.log_name)
    else:
        # Fall back to the most recently created log directory for this env.
        env_dir = os.path.join('logs', args.env_id, '*')
        dirs = glob.glob(env_dir)
        log_dir = max(dirs, key=os.path.getctime)
        print(f'using {log_dir}')

    env = gym.make(args.env_id)
    device = torch.device("cuda" if args.cuda and torch.cuda.is_available() else "cpu")

    policy = GaussianPolicy(
        env.observation_space.shape[0],
        env.action_space.shape[0],
        hidden_units=[256, 256]).to(device)
    policy.load(os.path.join(log_dir, 'model', 'policy.pth'))
    grad_false(policy)

    def exploit(state):
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        with torch.no_grad():
            _, _, action = policy.sample(state)
        return action.cpu().numpy().reshape(-1)

    env.render()
    while True:
        state = env.reset()
        episode_reward = 0.
        done = False
        while not done:
            env.render()
            action = exploit(state)
            next_state, reward, done, _ = env.step(action)
            episode_reward += reward
            state = next_state
        print(f'total reward: {episode_reward}')
        time.sleep(1)

def __init__(self, id, o_dim, a_dim, action_bound, device, eval_episode):
    self.id = id
    self.fitness = 0.
    self.eval_episode = eval_episode
    self.device = device
    self.action_bound = action_bound
    # self.actor = DeterministicPolicy(o_dim, a_dim).to(device)
    self.actor = GaussianPolicy(o_dim, a_dim).to(self.device)

def __init__(self):
    super(Off_policy, self).__init__()
    self.memory = Replay_buffer(capacity=p.exploitory_policy_memory_size)

    self.exploratory_policy = GaussianPolicy(self.state_space, self.action_space).to(self.device)
    self.exploratory_Q = QNet(self.state_space, self.action_space).to(self.device)
    self.exploratory_Q_target = QNet(self.state_space, self.action_space).to(self.device)
    self.exploratory_policy_optim = Adam(self.exploratory_policy.parameters(), lr=p.lr)
    self.exploratory_Q_optim = Adam(self.exploratory_Q.parameters(), lr=p.lr)

    # Initialize the exploratory policy from the exploitory one.
    self.target_update(self.exploratory_policy, self.exploitory_policy, 1.0)

    self.kl_normalizer = Normalizer(1)
    self.ex_rewards_normalizer = Normalizer(1)

def __init__(self, num_inputs, action_space, agent_args):
    self.gamma = agent_args["gamma"]
    self.tau = agent_args["tau"]
    self.alpha = agent_args["alpha"]
    self.policy_type = agent_args["policy"]
    self.target_update_interval = agent_args["target_update_interval"]
    self.automatic_entropy_tuning = agent_args["automatic_entropy_tuning"]
    self.device = torch.device("cuda" if agent_args["cuda"] else "cpu")

    # Discrete action spaces expose .n, continuous (Box) spaces expose .shape[0].
    if isinstance(action_space, gym.spaces.discrete.Discrete):
        action_shape = action_space.n
    else:
        action_shape = action_space.shape[0]

    self.critic = QNetwork(num_inputs, action_shape, agent_args["hidden_size"]).to(device=self.device)
    self.critic_optim = Adam(self.critic.parameters(), lr=agent_args["lr"])
    self.critic_target = QNetwork(num_inputs, action_shape, agent_args["hidden_size"]).to(self.device)
    hard_update(self.critic_target, self.critic)

    if self.policy_type == "Gaussian":
        # Target Entropy = -dim(A) (e.g. -6 for HalfCheetah-v2), as given in the paper.
        if self.automatic_entropy_tuning == True:
            self.target_entropy = -torch.prod(
                torch.Tensor(action_space.shape).to(self.device)).item()
            self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
            self.alpha_optim = Adam([self.log_alpha], lr=agent_args["lr"])

        self.policy = GaussianPolicy(num_inputs, action_shape, agent_args["hidden_size"],
                                     action_space).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=agent_args["lr"])
    else:
        self.alpha = 0
        self.automatic_entropy_tuning = False
        self.policy = DeterministicPolicy(num_inputs, action_shape, agent_args["hidden_size"],
                                          action_space).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=agent_args["lr"])

def __init__(self, args, env_params):
    self.o_dim = env_params['o_dim']
    self.a_dim = env_params['a_dim']
    self.g_dim = env_params['g_dim']
    self.action_boundary = env_params['action_boundary']
    self.max_episode_steps = env_params['max_episode_steps']

    self.evaluate_episodes = args.evaluate_episodes
    self.lr_pi = args.lr_pi_TD3
    self.lr_q = args.lr_q
    self.gamma = args.gamma
    self.tau = args.tau
    self.action_var = args.action_var
    self.noise_std = args.noise_std
    self.noise_clip = args.noise_clip
    self.K_updates = args.K_updates_TD3
    self.policy_update_interval = args.policy_update_interval
    self.batch_size = args.batch_size
    self.device = torch.device(args.device)
    self.load_model_remark = args.load_model_remark

    self.total_trained_goal_num = 0
    self.total_episode_num = 0
    self.total_update_num = 0
    self.policy_loss_log = 0.
    self.q1_loss_log = 0.
    self.q2_loss_log = 0.

    self.memory = MemoryBuffer(args.memory_capacity, self.o_dim, self.g_dim, self.a_dim)

    self.policy = GaussianPolicy(self.o_dim, self.g_dim, self.a_dim, self.action_var, self.device).to(self.device)
    self.policy_target = GaussianPolicy(self.o_dim, self.g_dim, self.a_dim, self.action_var, self.device).to(self.device)
    self.Q1 = QFunction(self.o_dim, self.g_dim, self.a_dim).to(self.device)
    self.Q1_target = QFunction(self.o_dim, self.g_dim, self.a_dim).to(self.device)
    self.Q2 = QFunction(self.o_dim, self.g_dim, self.a_dim).to(self.device)
    self.Q2_target = QFunction(self.o_dim, self.g_dim, self.a_dim).to(self.device)

    self.optimizer_pi = optim.Adam(self.policy.parameters(), lr=self.lr_pi)
    self.optimizer_q1 = optim.Adam(self.Q1.parameters(), lr=self.lr_q)
    self.optimizer_q2 = optim.Adam(self.Q2.parameters(), lr=self.lr_q)

    self.hard_update()

def testing():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_name', type=str, default='HalfCheetah-v2')
    parser.add_argument('--num_episode', type=int, default=10)
    args = parser.parse_args()
    num_episode = args.num_episode

    env = gym.make(args.env_name)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    policy = GaussianPolicy(
        env.observation_space.shape[0],
        env.action_space.shape[0],
        hidden_units=[256, 256]).to(device)
    policy.load(os.path.join('models', args.env_name, 'policy.pth'))
    grad_false(policy)

    def exploit(state):
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        with torch.no_grad():
            _, _, action = policy.sample(state)
        return action.cpu().numpy().reshape(-1)

    e_rewards = []
    for _ in range(num_episode):
        state = env.reset()
        episode_reward = 0.
        done = False
        while not done:
            if num_episode <= 1:
                env.render()
            action = exploit(state)
            next_state, reward, done, _ = env.step(action)
            episode_reward += reward
            state = next_state
        e_rewards.append(episode_reward)

    print("Average reward of " + args.env_name + " is %.1f" % (np.mean(e_rewards)))
    print("Reward std of " + args.env_name + " is %.1f" % (np.std(e_rewards)))

def __init__(self, input_space, action_space, args):
    self.use_expert = args.use_expert
    self.gamma = args.gamma
    self.tau = args.tau
    self.alpha = args.alpha
    self.action_range = [action_space.low, action_space.high]
    self.policy_type = args.policy
    self.target_update_interval = args.target_update_interval
    self.automatic_entropy_tuning = args.automatic_entropy_tuning

    # self.device = torch.device("cuda" if args.cuda else "cpu")
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    self.critic = QNetwork(input_space, action_space.shape[0], args.hidden_size).to(device=self.device)
    self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)
    self.critic_target = QNetwork(input_space, action_space.shape[0], args.hidden_size).to(self.device)
    hard_update(self.critic_target, self.critic)

    if self.policy_type == "Gaussian":
        # Target Entropy = -dim(A) (e.g. -6 for HalfCheetah-v2), as given in the paper.
        if self.automatic_entropy_tuning is True:
            self.target_entropy = -torch.prod(torch.Tensor(action_space.shape).to(self.device)).item()
            self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
            self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

        self.policy = GaussianPolicy(input_space, action_space.shape[0], args.hidden_size,
                                     action_space).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
    else:
        raise ValueError("Policy types other than Gaussian are not supported yet.")

def __init__(self, action_size, state_size, config):
    self.action_size = action_size
    self.state_size = state_size
    self.min_action = config["min_action"]
    self.max_action = config["max_action"]
    self.seed = config["seed"]
    self.tau = config["tau"]
    self.gamma = config["gamma"]
    self.batch_size = config["batch_size"]

    # Fall back to CPU when CUDA is unavailable (the original used '==', which compares
    # instead of assigning).
    if not torch.cuda.is_available():
        config["device"] = "cpu"
    self.device = config["device"]
    self.eval = config["eval"]

    torch.manual_seed(self.seed)
    np.random.seed(self.seed)

    self.vid_path = config["vid_path"]
    print("actions size ", action_size)
    print("actions min ", self.min_action)
    print("actions max ", self.max_action)

    self.critic = QNetwork(state_size, action_size, config["fc1_units"], config["fc2_units"]).to(self.device)
    self.q_optim = torch.optim.Adam(self.critic.parameters(), config["lr_critic"])
    self.target_critic = QNetwork(state_size, action_size, config["fc1_units"], config["fc2_units"]).to(self.device)
    self.target_critic.load_state_dict(self.critic.state_dict())

    self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
    self.alpha = self.log_alpha.exp()
    self.alpha_optim = Adam([self.log_alpha], lr=config["lr_alpha"])

    # self.policy = SACActor(state_size, action_size).to(self.device)
    self.policy = GaussianPolicy(state_size, action_size, 256).to(self.device)
    self.policy_optim = Adam(self.policy.parameters(), lr=config["lr_policy"])

    self.max_timesteps = config["max_episodes_steps"]
    self.episodes = config["episodes"]
    self.memory = ReplayBuffer((state_size,), (action_size,), config["buffer_size"], self.device)

    pathname = config["seed"]
    tensorboard_name = str(config["res_path"]) + '/runs/' + str(pathname)
    self.writer = SummaryWriter(tensorboard_name)
    self.steps = 0

    # action_size is an int, so wrap it in a list; torch.Tensor(action_size) would allocate
    # an uninitialized tensor of length action_size instead.
    self.target_entropy = -torch.prod(torch.Tensor([action_size]).to(self.device)).item()

def __init__(self, num_inputs, action_space, args):
    # self.n_flow = args.n_flows
    # assert self.n_flow == 0
    self.num_inputs = num_inputs
    # self.flow_family = args.flow_family
    self.num_layers = args.num_layers
    self.args = args

    self.gamma = args.gamma
    self.tau = args.tau
    self.alpha = args.alpha
    self.target_update_interval = args.target_update_interval
    self.automatic_entropy_tuning = args.automatic_entropy_tuning
    self.device = torch.device("cuda" if args.cuda else "cpu")

    self.critic = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(device=self.device)
    self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)
    self.critic_target = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(self.device)
    hard_update(self.critic_target, self.critic)

    if self.automatic_entropy_tuning:
        self.target_entropy = -torch.prod(
            torch.Tensor(action_space.shape).to(self.device)).item()
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

    self.policy = GaussianPolicy(num_inputs, action_space.shape[0], args.hidden_size,
                                 self.num_layers, args).to(self.device)
    self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

def run():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_id', type=str, default='HalfCheetah-v2')
    parser.add_argument('--log_name', type=str, default='sac-seed0-datetime')
    parser.add_argument('--cuda', action='store_true')
    parser.add_argument('--seed', type=int, default=0)
    args = parser.parse_args()

    log_dir = os.path.join('logs', args.env_id, args.log_name)

    env = gym.make(args.env_id)
    device = torch.device("cuda" if args.cuda and torch.cuda.is_available() else "cpu")

    policy = GaussianPolicy(env.observation_space.shape[0],
                            env.action_space.shape[0],
                            hidden_units=[256, 256]).to(device)
    policy.load(os.path.join(log_dir, 'model', 'policy.pth'))
    grad_false(policy)

    def exploit(state):
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        with torch.no_grad():
            _, _, action = policy.sample(state)
        return action.cpu().numpy().reshape(-1)

    state = env.reset()
    episode_reward = 0.
    done = False
    while not done:
        env.render()
        action = exploit(state)
        next_state, reward, done, _ = env.step(action)
        episode_reward += reward
        state = next_state

def __init__(self, num_inputs, action_space, config):
    self.gamma = config['gamma']
    self.tau = config['tau']
    self.alpha = config['alpha']
    self.policy_type = config['policy']
    self.target_update_interval = config['target_update_interval']
    self.automatic_entropy_tuning = config['automatic_entropy_tuning']

    self.device = torch.device('cuda:' + str(config['cuda'])) \
        if torch.cuda.is_available() and config['cuda'] >= 0 else torch.device('cpu')

    self.critic = QNetwork(num_inputs, action_space.shape[0], config['hidden_size']).to(device=self.device)
    self.critic_optim = Adam(self.critic.parameters(), lr=config['lr'])
    self.critic_target = QNetwork(num_inputs, action_space.shape[0], config['hidden_size']).to(self.device)
    hard_update(self.critic_target, self.critic)

    if self.policy_type == "Gaussian":
        # Target Entropy = -dim(A) (e.g. -6 for HalfCheetah-v2), as given in the paper.
        if self.automatic_entropy_tuning == True:
            self.target_entropy = -torch.prod(
                torch.Tensor(action_space.shape).to(self.device)).item()
            self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
            self.alpha_optim = Adam([self.log_alpha], lr=config['lr'])

        self.policy = GaussianPolicy(num_inputs, action_space.shape[0], config['hidden_size'],
                                     action_space).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=config['lr'])

def __init__(self, num_inputs, action_space, args):
    self.gamma = args.gamma
    self.tau = args.tau
    self.alpha = args.alpha
    self.policy_type = args.policy
    self.target_update_interval = args.target_update_interval
    self.automatic_entropy_tuning = args.automatic_entropy_tuning
    self.device = torch.device("cuda" if args.cuda else "cpu")

    # Q network, which yields a value for each (s_t, a_t) pair
    self.critic = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(device=self.device)
    self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

    # A replica of the critic: because of the recursive Bellman target, the Q network would
    # otherwise learn from itself, which is unstable.
    self.critic_target = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(self.device)
    # Both networks start from the same weights.
    hard_update(self.critic_target, self.critic)

    if self.policy_type == "Gaussian":
        # todo: crunch on this automatic alpha update
        # Target Entropy = -dim(A) (e.g. -6 for HalfCheetah-v2), as given in the paper.
        if self.automatic_entropy_tuning == True:
            self.target_entropy = -torch.prod(
                torch.Tensor(action_space.shape).to(self.device)).item()
            self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
            self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

        # Instantiate the policy: given a state, it produces a distribution over actions.
        self.policy = GaussianPolicy(num_inputs, action_space.shape[0], args.hidden_size).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
    else:
        self.alpha = 0
        self.automatic_entropy_tuning = False
        # todo: what's the difference between deterministic and Gaussian policies?
        self.policy = DeterministicPolicy(num_inputs, action_space.shape[0], args.hidden_size).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

def __init__(self, num_inputs, action_space, args):
    self.gamma = args.gamma  # discount factor γ
    self.tau = args.tau      # soft-update coefficient τ
    self.alpha = args.alpha  # entropy temperature α
    self.policy_type = args.policy  # policy type: Gaussian (stochastic) or deterministic
    self.target_update_interval = args.target_update_interval  # target network update interval
    self.automatic_entropy_tuning = args.automatic_entropy_tuning  # automatic entropy tuning
    self.device = torch.device("cuda" if args.cuda else "cpu")

    self.critic = QNetwork(num_inputs, action_space.shape[0],
                           args.hidden_size).to(device=self.device)  # critic (Q) network
    self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)
    self.critic_target = QNetwork(num_inputs, action_space.shape[0],
                                  args.hidden_size).to(self.device)  # target Q network
    hard_update(self.critic_target, self.critic)

    if self.policy_type == "Gaussian":
        # Target Entropy = -dim(A) (e.g. -6 for HalfCheetah-v2), as given in the paper.
        if self.automatic_entropy_tuning == True:
            # torch.prod(input) returns the product of all elements of input.
            self.target_entropy = -torch.prod(
                torch.Tensor(action_space.shape).to(self.device)).item()
            self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
            self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

        self.policy = GaussianPolicy(num_inputs, action_space.shape[0], args.hidden_size,
                                     action_space).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
    else:
        self.alpha = 0
        self.automatic_entropy_tuning = False
        self.policy = DeterministicPolicy(num_inputs, action_space.shape[0], args.hidden_size,
                                          action_space).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

def __init__(self, num_inputs, action_space, args):
    self.gamma = args.gamma
    self.tau = args.tau
    self.alpha = args.alpha
    self.policy_type = args.policy
    self.target_update_interval = args.target_update_interval
    self.automatic_entropy_tuning = args.automatic_entropy_tuning
    self.device = torch.device("cuda" if args.cuda else "cpu")

    # Twin Q networks, similar to Double Q-learning
    self.critic = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(device=self.device)
    self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)
    self.critic_target = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(self.device)
    hard_update(self.critic_target, self.critic)  # both networks start from the same initialization

    # Two policy options: stochastic (Gaussian) or deterministic
    if self.policy_type == "Gaussian":
        # Target Entropy = -dim(A) (e.g. -6 for HalfCheetah-v2), as given in the paper.
        if self.automatic_entropy_tuning is True:
            self.target_entropy = -torch.prod(
                torch.Tensor(action_space.shape).to(self.device)).item()
            self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
            self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

        self.policy = GaussianPolicy(num_inputs, action_space.shape[0], args.hidden_size,
                                     action_space).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
    else:
        self.alpha = 0
        self.automatic_entropy_tuning = False
        self.policy = DeterministicPolicy(num_inputs, action_space.shape[0], args.hidden_size,
                                          action_space).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

def __init__(self, num_inputs, action_space, args):
    self.num_inputs = num_inputs
    self.action_space = action_space.shape[0]
    self.gamma = args.gamma
    self.tau = args.tau
    self.policy_type = args.policy
    self.target_update_interval = args.target_update_interval
    self.automatic_entropy_tuning = args.automatic_entropy_tuning

    self.critic = QNetwork(self.num_inputs, self.action_space, args.hidden_size)
    self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

    if self.policy_type == "Gaussian":
        self.alpha = args.alpha
        # Target Entropy = -dim(A) (e.g. -6 for HalfCheetah-v2), as given in the paper.
        if self.automatic_entropy_tuning == True:
            self.target_entropy = -torch.prod(torch.Tensor(action_space.shape)).item()
            self.log_alpha = torch.zeros(1, requires_grad=True)
            self.alpha_optim = Adam([self.log_alpha], lr=args.lr)
        else:
            pass

        self.policy = GaussianPolicy(self.num_inputs, self.action_space, args.hidden_size)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

        self.value = ValueNetwork(self.num_inputs, args.hidden_size)
        self.value_target = ValueNetwork(self.num_inputs, args.hidden_size)
        self.value_optim = Adam(self.value.parameters(), lr=args.lr)
        hard_update(self.value_target, self.value)
    else:
        self.policy = DeterministicPolicy(self.num_inputs, self.action_space, args.hidden_size)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

        self.critic_target = QNetwork(self.num_inputs, self.action_space, args.hidden_size)
        hard_update(self.critic_target, self.critic)

def __init__(self, num_inputs, action_space, variant):
    self.gamma = variant['gamma']
    self.tau = variant['tau']
    self.alpha = variant['alpha']
    self.policy_type = variant['policy_type']
    self.target_update_interval = variant['target_update_interval']
    self.automatic_entropy_tuning = variant['automatic_entropy_tuning']
    self.lr = variant.get("lr", 1e-3)
    self.device = torch.device("cuda" if variant['cuda'] else "cpu")
    self.hidden_size = variant.get('hidden_size', [128, 128])

    self.critic = QNetwork(num_inputs, action_space.shape[0], self.hidden_size).to(self.device)
    self.critic_optim = Adam(self.critic.parameters(), lr=self.lr)
    self.critic_target = QNetwork(num_inputs, action_space.shape[0], self.hidden_size).to(self.device)
    hard_update(self.critic_target, self.critic)

    if self.policy_type == 'Gaussian':
        if self.automatic_entropy_tuning:
            self.target_entropy = -torch.prod(
                torch.Tensor(action_space.shape).to(self.device)).item()
            self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
            self.alpha_optim = Adam([self.log_alpha], lr=self.lr)

        self.policy = GaussianPolicy(num_inputs, action_space.shape[0], self.hidden_size,
                                     action_space).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=self.lr)
    else:
        self.alpha = 0
        self.automatic_entropy_tuning = False
        self.policy = DeterministicPolicy(num_inputs, action_space.shape[0], self.hidden_size,
                                          action_space).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=self.lr)