def __init__(self, observation_space, action_space, lr, gamma, discrete=False, project_dim=4, device='cpu'):
    """
    Parameters
    ----------
    observation_space: int
        Number of flattened entries of the state
    action_space: int
        Number of (discrete) possible actions to take
    """
    self.gamma = gamma
    self.lr = lr
    self.n_actions = action_space
    self.discrete = discrete
    if self.discrete:
        self.net = Actor(observation_space, action_space, discrete, project_dim)
    else:
        self.net = Actor(observation_space, action_space, discrete)
    self.optim = torch.optim.Adam(self.net.parameters(), lr=self.lr)
    self.device = device
    self.net.to(self.device)  # move network to device
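# The gamma and lr stored above are presumably consumed by an update step not
# shown in this snippet. A minimal sketch of the discounted-return computation
# such a policy-gradient agent typically relies on (the helper name and its
# standalone form are illustrative, not from the source):
def discounted_returns(rewards, gamma):
    """Return G_t = sum_k gamma**k * r_{t+k} for each step of one episode."""
    returns = []
    g = 0.0
    for r in reversed(rewards):
        g = r + gamma * g
        returns.append(g)
    returns.reverse()
    return returns

# Example: discounted_returns([1.0, 1.0, 1.0], 0.9) -> [2.71, 1.9, 1.0]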
def __init__(self, observation_space, action_space, lr_actor, lr_critic, gamma, device='cpu', discrete=False, project_dim=8):
    """
    Parameters
    ----------
    observation_space: int
        Number of flattened entries of the state
    action_space: int
        Number of (discrete) possible actions to take
    """
    self.gamma = gamma
    self.n_actions = action_space
    self.discrete = discrete
    if self.discrete:
        self.actor = DiscreteActor(observation_space, action_space, project_dim)
        self.critic = DiscreteCritic(observation_space, project_dim)
    else:
        self.actor = Actor(observation_space, action_space)
        self.critic = Critic(observation_space)
    self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=lr_actor)
    self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=lr_critic)
    self.device = device
class LunarLander(Problem):
    def __init__(self, continuous=True, seed=0):
        super(LunarLander, self).__init__()
        if continuous:
            self.env = gym.make('LunarLanderContinuous-v2')
        else:
            self.env = gym.make('LunarLander-v2')
        self.env.seed(seed)
        self.mu = Actor(s_dim=8, a_dim=4, h_dim=20)

    def score(self, x):
        self.mu.fill_weights(x)
        r_tot = 0.
        done = False
        s = self.env.reset()
        while not done:
            a = self.mu(s)
            s, r, done, _ = self.env.step(a)
            r_tot += r
        return -r_tot  # negated so that minimizing score maximizes episode return

    def score_vec(self, X):
        # The original returned before closing the pool, leaving close()/join()
        # unreachable; collect the results first, then clean up.
        p = Pool(4)
        scores = p.map(self.score, [x for x in X])
        p.close()
        p.join()
        return scores

    def save(self, x, g):
        self.mu.fill_weights(x)
        pickle.dump(self.mu, open('log/lander/models/model_' + str(g), 'wb'))
def __init__(self, state_size, action_size):
    super().__init__()
    gpu = torch.cuda.is_available()
    if gpu:
        print('GPU/CUDA works! Happy fast training :)')
        torch.cuda.current_device()
        torch.cuda.empty_cache()
        self.device = torch.device("cuda")
    else:
        print('training on cpu...')
        self.device = torch.device("cpu")
    self.actor = Actor(state_size, action_size).to(self.device)
    self.actor_target = Actor(state_size, action_size).to(self.device)
    self.actor_optim = optim.Adam(self.actor.parameters(), lr=0.0001)
    self.critic = Critic(state_size, action_size).to(self.device)
    self.critic_target = Critic(state_size, action_size).to(self.device)
    self.critic_optim = optim.Adam(self.critic.parameters(), lr=0.001, weight_decay=0)
    self.replay_buffer = deque(maxlen=1000000)  # 1M transitions
    self.gamma = 0.95  # 0.99
    self.batch_size = 128
    self.tau = 0.001
    self.seed = random.seed(2)
    self.noise = OUNoise((20, action_size), 2)
    self.target_network_update(self.actor_target, self.actor, 1.0)
    self.target_network_update(self.critic_target, self.critic, 1.0)
def __init__(self, in_actor, out_actor,
             in_critic,  # e.g. = n_agent * (state_size + action_size)
             lr_actor=1e-4,
             lr_critic=1e-3,  # the critic typically learns faster than the actor
             random_seed=2):
    self.state_size = in_actor
    self.action_size = out_actor
    self.seed = random.seed(random_seed)
    self.params = {"lr_actor": lr_actor, "lr_critic": lr_critic, "optimizer": "adam"}

    self.local_actor = Actor(in_shape=in_actor, out_shape=out_actor).to(device)
    self.target_actor = Actor(in_shape=in_actor, out_shape=out_actor).to(device)
    self.actor_optimizer = optim.Adam(self.local_actor.parameters(), lr=lr_actor)

    # For a single agent, the critic takes global observations as input and
    # outputs the action-value Q, e.g. global_states = all_states + all_actions.
    self.local_critic = Critic(in_shape=in_critic).to(device)
    self.target_critic = Critic(in_shape=in_critic).to(device)
    self.critic_optimizer = optim.Adam(self.local_critic.parameters(), lr=lr_critic)

    # Local and target networks should start from identical weights, so
    # hard-copy the local parameters into the targets at initialization.
    hard_update_A_from_B(self.target_actor, self.local_actor)
    hard_update_A_from_B(self.target_critic, self.local_critic)

    # Noise process
    self.noise = OUNoise(out_actor, scale=1.0)
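# hard_update_A_from_B is referenced above but not defined in this snippet. A
# minimal sketch of what it presumably does, i.e. copy B's parameters into A
# (an assumption based on the surrounding comments, not the source definition):
def hard_update_A_from_B(A, B):
    """Hard copy: overwrite every parameter of network A with network B's."""
    for a_param, b_param in zip(A.parameters(), B.parameters()):
        a_param.data.copy_(b_param.data)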
def __init__(self, state_dim, action_dim, num_shared, device):
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.device = device
    self.actor = Actor(state_dim, action_dim, num_shared).to(device)
    self.critic = Critic(state_dim, num_shared).to(device)
def __init__(self, in_actor, in_critic, action_size, num_agents, random_seed):
    """Init the agent."""
    super(DDPG_agent, self).__init__()
    self.action_size = action_size
    self.seed = random_seed

    # Fully connected actor network
    self.actor_local = Actor(in_actor, self.action_size, self.seed).to(device)
    self.actor_target = Actor(in_actor, self.action_size, self.seed).to(device)
    self.actor_optimizer = Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Fully connected critic network
    self.critic_local = Critic(in_critic, num_agents * self.action_size, self.seed).to(device)
    self.critic_target = Critic(in_critic, num_agents * self.action_size, self.seed).to(device)
    self.critic_optimizer = Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

    # Ornstein-Uhlenbeck noise process for exploration
    self.noise = OUNoise(action_size, random_seed)
def __init__(self, actor_size, action_size, critic_size):
    super().__init__()
    gpu = torch.cuda.is_available()
    if gpu:
        print('GPU/CUDA works! Happy fast training :)')
        torch.cuda.current_device()
        torch.cuda.empty_cache()
        self.device = torch.device("cuda")
    else:
        print('training on cpu...')
        self.device = torch.device("cpu")
    self.actor = Actor(actor_size, action_size).to(self.device)
    self.actor_target = Actor(actor_size, action_size).to(self.device)
    self.actor_optim = optim.Adam(self.actor.parameters(), lr=0.0001)
    self.critic = Critic(critic_size).to(self.device)
    self.critic_target = Critic(critic_size).to(self.device)
    self.critic_optim = optim.Adam(self.critic.parameters(), lr=0.001, weight_decay=0)
    self.gamma = 0.95  # 0.99
    self.tau = 0.001
    self.noise = OUNoise(action_size, 2)
    self.target_network_update(self.actor_target, self.actor, 1.0)
    self.target_network_update(self.critic_target, self.critic, 1.0)
def run_agent(env: Environment, actor: Actor, render: bool, n_eval_episode: int, **kwargs):
    actor.eval()
    scores = []
    for _ in range(n_eval_episode):
        score = 0.
        states = env.reset(render=render)
        for step in count():
            actions = actor.act(states)
            actions = actions.detach().numpy()
            __log.debug("Actions: %s." % str(actions))
            states, rewards, dones, _ = env.step(actions)
            score += np.mean(rewards)
            if any(dones):
                __log.info("Done.")
                break
            if 'max_step' in kwargs and step >= kwargs['max_step']:
                __log.info("Break due to hitting max_step.")
                break
        __log.info("Score: {}".format(score))
        scores.append(score)
    __log.info("Average: {}".format(np.mean(scores)))
def __init__(self, num_agents, state_size, action_size, random_seed=2018):
    self.num_agents = num_agents
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(random_seed)
    self.device = torch.device('cuda' if cuda else 'cpu')
    self.update = UPDATE_EVERY
    self.updates = NUMBER_OF_UPDATES

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

    # Noise process
    self.noise = OUNoise(action_size, random_seed)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed, device)
def __init__(self, continuous=True, seed=0):
    super(LunarLander, self).__init__()
    if continuous:
        self.env = gym.make('LunarLanderContinuous-v2')
    else:
        self.env = gym.make('LunarLander-v2')
    self.env.seed(seed)
    self.mu = Actor(s_dim=8, a_dim=4, h_dim=20)
def __init__(self, state_size, action_size, n_agents, random_seed=1):
    self.actor_local = Actor(state_size, action_size, random_seed)
    self.actor_target = Actor(state_size, action_size, random_seed)
    self.ddpg_agents = [
        DDPGAgent(state_size, action_size, self.actor_local, self.actor_target, random_seed)
        for _ in range(n_agents)
    ]
def __init__(self, sess, dimo, dimu, u_bound, critic_lr, actor_lr, critic_l2, clip_norm,
             tau, layer_norm, noisy_layer, gamma, memory_size, exploration, batch_size, env_dt):
    self._sess = sess
    self._dimo = dimo
    self._dimu = dimu
    self._critic_l2 = critic_l2
    self._actor_lr = actor_lr
    self._critic_lr = critic_lr
    self._clip_norm = clip_norm
    self._noisy = noisy_layer
    self._gamma = gamma
    self._tau = tau
    self._batch_size = batch_size
    self._u_bound = u_bound
    self._global_step = tf.train.get_or_create_global_step()
    self.ou_noise = OUNoise(dim=dimu, n_step_annealing=exploration, dt=env_dt)
    self._memory = ReplayMemory(memory_size)

    with tf.variable_scope('inputs'):
        self._obs = tf.placeholder(tf.float32, [None, self._dimo], name='state')
        self._u = tf.placeholder(tf.float32, [None, self._dimu], name='action')
        self._t_obs = tf.placeholder(tf.float32, [None, self._dimo], name='target_state')

    with tf.variable_scope('actor'):
        self._actor = Actor('main', self._obs, dimu, layer_norm, noisy_layer)
        self._target_actor = Actor('target', self._t_obs, dimu, layer_norm, noisy_layer)

    with tf.variable_scope('critic'):
        self._critic = Critic('main', self._obs, self._u, layer_norm, noisy_layer)
        self._critic_pi = Critic('main', self._obs,
                                 U.scaling(self._actor.pi, -1.0, 1.0,
                                           self._u_bound['low'], self._u_bound['high']),
                                 layer_norm, noisy_layer, reuse=True)
        self._target_critic = Critic('target', self._t_obs,
                                     U.scaling(self._target_actor.pi, -1.0, 1.0,
                                               self._u_bound['low'], self._u_bound['high']),
                                     layer_norm, noisy_layer)

    self._build_train_method()
    self._update_target_op = self._update_target_networks()
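# _update_target_networks is called above but its body is not shown. A plausible
# TF1-style construction of the soft-update op (an assumption throughout: the
# scope names follow the 'actor'/'critic' and 'main'/'target' scopes above):
def _update_target_networks_sketch(tau):
    ops = []
    for scope in ('actor', 'critic'):
        main_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/main')
        target_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/target')
        for m, t in zip(main_vars, target_vars):
            # target <- tau * main + (1 - tau) * target
            ops.append(tf.assign(t, tau * m + (1.0 - tau) * t))
    return tf.group(*ops)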
def __init__(self, state_size: int, action_size: int, num_agents: int, epsilon, random_seed: int):
    """
    Initialize a DDPG Agent Object

    :param state_size: dimension of state (input)
    :param action_size: dimension of action (output)
    :param num_agents: number of concurrent agents in the environment
    :param epsilon: initial value of epsilon for exploration
    :param random_seed: random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.num_agents = num_agents
    self.seed = random.seed(random_seed)
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.t_step = 0

    # Hyperparameters
    self.buffer_size = 1000000
    self.batch_size = 128
    self.update_every = 10
    self.num_updates = 10
    self.gamma = 0.99
    self.tau = 0.001
    self.lr_actor = 0.0001
    self.lr_critic = 0.001
    self.weight_decay = 0
    self.epsilon = epsilon
    self.epsilon_decay = 0.97
    self.epsilon_min = 0.005

    # Networks (Actor: State -> Action, Critic: (State, Action) -> Value)
    self.actor_local = Actor(self.state_size, self.action_size, random_seed).to(self.device)
    self.actor_target = Actor(self.state_size, self.action_size, random_seed).to(self.device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor)
    self.critic_local = Critic(self.state_size, self.action_size, random_seed).to(self.device)
    self.critic_target = Critic(self.state_size, self.action_size, random_seed).to(self.device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic,
                                       weight_decay=self.weight_decay)

    # Initialize actor and critic networks to start with same parameters
    self.soft_update(self.actor_local, self.actor_target, tau=1)
    self.soft_update(self.critic_local, self.critic_target, tau=1)

    # Noise Setup
    self.noise = OUNoise(self.action_size, random_seed)

    # Replay Buffer Setup
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
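# soft_update is called with tau=1 above so the targets start identical to the
# local networks. A minimal sketch of the usual Polyak-averaging helper, with
# the (local, target, tau) signature assumed from the calls above:
def soft_update(local_model, target_model, tau):
    """theta_target <- tau * theta_local + (1 - tau) * theta_target."""
    for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

# With tau=1 this reduces to a hard copy, which is what the initialization
# above relies on.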
def __init__(self, state_dim, action_dim, action_lim, update_type='soft',
             lr_actor=1e-4, lr_critic=1e-3, tau=1e-3, mem_size=1e6,
             batch_size=256, gamma=0.99, other_cars=False, ego_dim=None):
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    self.joint_model = False
    if len(state_dim) == 3:
        self.model = ActorCriticCNN(state_dim, action_dim, action_lim)
        self.model_optim = optim.Adam(self.model.parameters(), lr=lr_actor)
        self.target_model = ActorCriticCNN(state_dim, action_dim, action_lim)
        self.target_model.load_state_dict(self.model.state_dict())
        self.model.to(self.device)
        self.target_model.to(self.device)
        self.joint_model = True
    else:
        self.actor = Actor(state_dim, action_dim, action_lim, other_cars=other_cars, ego_dim=ego_dim)
        self.actor_optim = optim.Adam(self.actor.parameters(), lr=lr_actor)
        self.target_actor = Actor(state_dim, action_dim, action_lim, other_cars=other_cars, ego_dim=ego_dim)
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_actor.eval()

        self.critic = Critic(state_dim, action_dim, other_cars=other_cars, ego_dim=ego_dim)
        self.critic_optim = optim.Adam(self.critic.parameters(), lr=lr_critic, weight_decay=1e-2)
        self.target_critic = Critic(state_dim, action_dim, other_cars=other_cars, ego_dim=ego_dim)
        self.target_critic.load_state_dict(self.critic.state_dict())
        self.target_critic.eval()

        self.actor.to(self.device)
        self.target_actor.to(self.device)
        self.critic.to(self.device)
        self.target_critic.to(self.device)

    self.action_lim = action_lim
    self.tau = tau  # hard update if tau is None
    self.update_type = update_type
    self.batch_size = batch_size
    self.gamma = gamma

    if self.joint_model:
        mem_size = mem_size // 100
    self.memory = Memory(int(mem_size), action_dim, state_dim)

    mu = np.zeros(action_dim)
    sigma = np.array([0.5, 0.05])
    self.noise = OrnsteinUhlenbeckActionNoise(mu, sigma)
    self.target_noise = OrnsteinUhlenbeckActionNoise(mu, sigma)

    self.initialised = True
    self.training = False
def __init__(self, device, memory, state_size, action_size, low_bound, high_bound, folder, config):
    self.folder = folder
    self.config = config
    self.device = device
    self.memory = memory
    self.state_size = state_size
    self.action_size = action_size
    self.low_bound = low_bound
    self.high_bound = high_bound
    self.critic = Critic(state_size, action_size, device, self.config)
    self.actor = Actor(state_size, action_size, low_bound, high_bound, device, self.config)
def __init__(self,
             state_size,
             action_size,
             sample_batch_size,
             memory_size=int(1e5),  # replay buffer size
             batch_size=128,        # minibatch size
             gamma=0.99,            # discount factor
             tau=1e-3,              # for soft update of target parameters
             update_every=10,
             lr_actor=1e-4,
             lr_critic=1e-3,
             random_seed=2):
    self.sample_batch_size = sample_batch_size
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(random_seed)
    self.params = {
        "lr_actor": lr_actor,
        "lr_critic": lr_critic,
        "gamma": gamma,
        "tau": tau,
        "memory_size": memory_size,
        "batch_size": batch_size,
        "optimizer": "adam"
    }

    self.actor_local = Actor(state_size, action_size, seed=random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, seed=random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

    self.critic_local = Critic(state_size, action_size, seed=random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, seed=random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic)

    self.memory = ReplayBuffer(action_size, memory_size, batch_size, random_seed)

    # Noise process
    self.noise = OUNoise([sample_batch_size, action_size], random_seed)

    self.learn_steps = 0
    self.update_every = update_every
def __init__(self, device, state_size, action_size, folder, config):
    self.folder = folder
    self.config = config
    self.device = device
    self.memory = ReplayMemory(self.config["MEMORY_CAPACITY"])
    self.state_size = state_size
    self.action_size = action_size
    self.critic = Critic(self.state_size, self.action_size, self.device, self.config)
    self.actor = Actor(self.state_size, self.action_size, self.device, self.config)
def __init__(self, env, log_dir, gamma=0.99, batch_size=64, sigma=0.2, batch_norm=True,
             merge_layer=2, buffer_size=int(1e6), buffer_min=int(1e4), tau=1e-3,
             Q_wd=1e-2, num_episodes=1000):
    self.s_dim = env.reset().shape[0]
    # self.a_dim = env.action_space.shape[0]
    self.a_dim = env.action_space2.shape[0]
    # self.a_dim = 1
    self.env = env
    # self.mu = Actor(self.s_dim, self.a_dim, env.action_space, batch_norm=batch_norm)
    self.mu = Actor(self.s_dim, self.a_dim, env.action_space2, batch_norm=batch_norm)
    self.Q = Critic(self.s_dim, self.a_dim, batch_norm=batch_norm, merge_layer=merge_layer)
    self.targ_mu = copy.deepcopy(self.mu).eval()
    self.targ_Q = copy.deepcopy(self.Q).eval()
    self.noise = OrnsteinUhlenbeck(mu=torch.zeros(self.a_dim), sigma=sigma * torch.ones(self.a_dim))
    self.buffer = Buffer(buffer_size, self.s_dim, self.a_dim)
    self.buffer_min = buffer_min
    self.mse_fn = torch.nn.MSELoss()
    self.mu_optimizer = torch.optim.Adam(self.mu.parameters(), lr=1e-4)
    self.Q_optimizer = torch.optim.Adam(self.Q.parameters(), lr=1e-3, weight_decay=Q_wd)
    self.gamma = gamma
    self.batch_size = batch_size
    self.num_episodes = num_episodes
    self.tau = tau
    self.log_dir = log_dir
    self.fill_buffer()
class DDPG_agent(nn.Module):
    def __init__(self, in_actor, in_critic, action_size, num_agents, random_seed):
        """Init the agent."""
        super(DDPG_agent, self).__init__()
        self.action_size = action_size
        self.seed = random_seed

        # Fully connected actor network
        self.actor_local = Actor(in_actor, self.action_size, self.seed).to(device)
        self.actor_target = Actor(in_actor, self.action_size, self.seed).to(device)
        self.actor_optimizer = Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Fully connected critic network
        self.critic_local = Critic(in_critic, num_agents * self.action_size, self.seed).to(device)
        self.critic_target = Critic(in_critic, num_agents * self.action_size, self.seed).to(device)
        self.critic_optimizer = Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Ornstein-Uhlenbeck noise process for exploration
        self.noise = OUNoise(action_size, random_seed)

    def act(self, state, add_noise=True):
        """Returns actions for the given state as per the current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def target_act(self, state, add_noise=True):
        """Returns actions for the given state as per the current target policy."""
        action = self.actor_target(state)
        return action

    def reset(self):
        """Resets the noise process."""
        self.noise.reset()
def __init__(self, policy: str, action_dim: int, max_action: float, lr: float, discount: float,
             noise_clip: float, policy_noise: float, policy_freq: int, actor_rng: jnp.ndarray,
             critic_rng: jnp.ndarray, sample_state: np.ndarray):
    self.discount = discount
    self.noise_clip = noise_clip
    self.policy_noise = policy_noise
    self.policy_freq = policy_freq
    self.max_action = max_action
    self.td3_update = policy == 'TD3'

    self.actor = hk.transform(lambda x: Actor(action_dim, max_action)(x))
    actor_opt_init, self.actor_opt_update = optix.adam(lr)
    self.critic = hk.transform(lambda x: Critic()(x))
    critic_opt_init, self.critic_opt_update = optix.adam(lr)

    self.actor_params = self.target_actor_params = self.actor.init(actor_rng, sample_state)
    self.actor_opt_state = actor_opt_init(self.actor_params)

    action = self.actor.apply(self.actor_params, sample_state)
    self.critic_params = self.target_critic_params = self.critic.init(
        critic_rng, jnp.concatenate((sample_state, action), 0))
    self.critic_opt_state = critic_opt_init(self.critic_params)

    self.updates = 0
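# The target parameter trees initialized above are presumably refreshed during
# training with Polyak averaging. A minimal JAX sketch (the function name and
# default tau are illustrative, not from the source):
import jax

def polyak_update(params, target_params, tau=0.005):
    """Blend online params into target params, leaf by leaf."""
    return jax.tree_map(lambda p, tp: tau * p + (1.0 - tau) * tp, params, target_params)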
def __init__(self, env_name: str, threads: int, episodes: int, entropy_weight: float,
             learning_rate: Union[float, tf.keras.optimizers.schedules.LearningRateSchedule],
             discount_factor: float):
    self.env_name = env_name
    env = gym.make(env_name)
    self.save_dir = os.path.expanduser('~/keras-a3c/models/')
    self.threads = threads
    self.EPISODES = episodes
    self.entropy_weight = entropy_weight
    self.learning_rate = learning_rate
    self.discount_factor = discount_factor

    actor = Actor(action_space_size=env.action_space.n)
    critic = Critic()
    self.global_model = ActorCriticModel(actor, critic)
    self.actor_loss = ActorLoss(entropy_weight)
    self.optimizer = tf.keras.optimizers.RMSprop(lr=learning_rate)
    # Build the global model's weights by running a dummy forward pass.
    self.global_model(tf.convert_to_tensor(np.random.random((1, env.observation_space.shape[0]))))
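# ActorLoss is instantiated above but not defined in this snippet. A common
# A3C-style formulation with an entropy bonus (an assumption about the class,
# including the y_true packing; not necessarily the author's exact code):
class ActorLoss(tf.keras.losses.Loss):
    def __init__(self, entropy_weight):
        super().__init__()
        self.entropy_weight = entropy_weight

    def call(self, y_true, y_pred):
        # y_true packs [action_taken, advantage]; y_pred holds policy logits.
        actions = tf.cast(y_true[:, 0], tf.int32)
        advantages = y_true[:, 1]
        log_policy = tf.nn.log_softmax(y_pred)
        policy = tf.nn.softmax(y_pred)
        log_prob_taken = tf.gather(log_policy, actions, axis=1, batch_dims=1)
        policy_loss = -tf.reduce_mean(log_prob_taken * advantages)
        # Entropy bonus discourages premature convergence to a deterministic policy.
        entropy = -tf.reduce_sum(policy * log_policy, axis=1)
        return policy_loss - self.entropy_weight * tf.reduce_mean(entropy)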
def __init__(self, state_size, action_size, num_agents, random_seed):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        num_agents (int): number of agents
        random_seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.num_agents = num_agents
    self.seed = random.seed(random_seed)

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

    # Noise process (one per agent)
    self.noise = [OUNoise(action_size, random_seed, sigma=0.1) for i in range(self.num_agents)]

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    # Make sure target is with the same weight as the source
    self.hard_update(self.actor_target, self.actor_local)
    self.hard_update(self.critic_target, self.critic_local)

    self.t_step = 0
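# OUNoise is used throughout these snippets but defined in none of them. A
# minimal sketch of the Ornstein-Uhlenbeck process, matching the constructor
# signature used above (the mu/theta defaults are conventional, not from the source):
import copy
import random
import numpy as np

class OUNoise:
    """Temporally correlated exploration noise for continuous actions."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Drift toward the mean, plus Gaussian diffusion."""
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.standard_normal(self.state.shape)
        self.state = self.state + dx
        return self.state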
class Agent():
    def __init__(self, actor_size, action_size, critic_size):
        super().__init__()
        gpu = torch.cuda.is_available()
        if gpu:
            print('GPU/CUDA works! Happy fast training :)')
            torch.cuda.current_device()
            torch.cuda.empty_cache()
            self.device = torch.device("cuda")
        else:
            print('training on cpu...')
            self.device = torch.device("cpu")
        self.actor = Actor(actor_size, action_size).to(self.device)
        self.actor_target = Actor(actor_size, action_size).to(self.device)
        self.actor_optim = optim.Adam(self.actor.parameters(), lr=0.0001)
        self.critic = Critic(critic_size).to(self.device)
        self.critic_target = Critic(critic_size).to(self.device)
        self.critic_optim = optim.Adam(self.critic.parameters(), lr=0.001, weight_decay=0)
        self.gamma = 0.95  # 0.99
        self.tau = 0.001
        self.noise = OUNoise(action_size, 2)
        self.target_network_update(self.actor_target, self.actor, 1.0)
        self.target_network_update(self.critic_target, self.critic, 1.0)

    def select_actions(self, state):
        state = torch.from_numpy(state).float().to(self.device).view(1, -1)
        # print(state.shape)
        self.actor.eval()
        with torch.no_grad():
            actions = self.actor(state).cpu().data.squeeze(0)
        self.actor.train()
        actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def target_network_update(self, target_network, network, tau):
        # tau = 1.0 performs a hard copy; smaller tau gives a Polyak soft update.
        for network_param, target_param in zip(network.parameters(), target_network.parameters()):
            target_param.data.copy_(tau * network_param.data + (1.0 - tau) * target_param.data)
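# None of these snippets show the learning step itself. A generic DDPG update
# sketch for the Agent above (an assumption: the Critic here takes the
# concatenated state-action vector, since it is built with a single size):
import torch
import torch.nn.functional as F

def ddpg_learn_step(agent, states, actions, rewards, next_states, dones):
    # Critic: regress Q(s, a) toward r + gamma * Q'(s', mu'(s')) * (1 - done).
    with torch.no_grad():
        next_actions = agent.actor_target(next_states)
        q_next = agent.critic_target(torch.cat([next_states, next_actions], dim=1))
        q_targets = rewards + agent.gamma * q_next * (1 - dones)
    q_expected = agent.critic(torch.cat([states, actions], dim=1))
    critic_loss = F.mse_loss(q_expected, q_targets)
    agent.critic_optim.zero_grad()
    critic_loss.backward()
    agent.critic_optim.step()

    # Actor: maximize Q(s, mu(s)) by minimizing its negation.
    actor_loss = -agent.critic(torch.cat([states, agent.actor(states)], dim=1)).mean()
    agent.actor_optim.zero_grad()
    actor_loss.backward()
    agent.actor_optim.step()

    # Soft-update the targets with the tau stored on the agent.
    agent.target_network_update(agent.actor_target, agent.actor, agent.tau)
    agent.target_network_update(agent.critic_target, agent.critic, agent.tau)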
class Agent():
    def __init__(self, args):
        self.args = args
        self.critic = Critic(args.dim_s, args.dim_a, args.dim_h, args.device)
        self.actor = Actor(args.dim_s, args.dim_a, args.dim_h, args.device)

    def choose_action(self, s):
        # print("agent state:", s)
        return self.actor.choose_action(s)

    def learn(self, trans):
        td = self.critic.cal_td_loss(trans['s'], trans['r'], trans['s_'])
        self.critic.learn(trans['s'], trans['r'], trans['s_'])
        self.actor.learn(td, trans['a'])

    def save(self, path):
        self.critic.save(path)
        self.actor.save(path)

    def load(self, path):
        self.critic.load(path)
        self.actor.load(path)
def __init__(self, state_dim, action_dim, device):
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.device = device

    self.actor = Actor(state_dim, action_dim).to(device)
    self.critic = Critic(state_dim).to(device)
    self.optimizer = torch.optim.Adam(
        itertools.chain(self.actor.parameters(), self.critic.parameters()), LR)

    self.philosophers = list()
    for i in range(P_COUNT):
        self.philosophers.append(Critic(state_dim).to(device))
    self.p_optimizers = [torch.optim.Adam(p.parameters(), lr=P_LR) for p in self.philosophers]

    self.update_cnt = 0
def __init__(self, args, env):
    self.learning_rate = args.learning_rate
    self.gamma = args.gamma
    self.lamb = args.lamb
    self.batch_size = args.batch_size
    self.step = 0
    self.epochs = args.epochs

    self.actor = Actor()
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.learning_rate)
    self.critic = Critic()
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.learning_rate)

    self.env = env
    self.num_actions = env.num_actions
    self.num_states = env.num_states
    self.data = {'step': [], 'reward': [], 'losses': []}
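# The lamb hyperparameter above is presumably the lambda of Generalized
# Advantage Estimation. A minimal GAE sketch for a finished episode (the
# standalone helper is illustrative, not the author's method):
def gae_advantages(rewards, values, gamma, lamb):
    """A_t = sum_k (gamma * lamb)**k * delta_{t+k}, with delta the TD error."""
    advantages = []
    gae = 0.0
    next_value = 0.0  # terminal state has zero value
    for r, v in zip(reversed(rewards), reversed(values)):
        delta = r + gamma * next_value - v
        gae = delta + gamma * lamb * gae
        advantages.append(gae)
        next_value = v
    advantages.reverse()
    return advantages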
class DDPG:
    def __init__(self, in_actor, out_actor,
                 in_critic,  # e.g. = n_agent * (state_size + action_size)
                 lr_actor=1e-4,
                 lr_critic=1e-3,  # the critic typically learns faster than the actor
                 random_seed=2):
        self.state_size = in_actor
        self.action_size = out_actor
        self.seed = random.seed(random_seed)
        self.params = {"lr_actor": lr_actor, "lr_critic": lr_critic, "optimizer": "adam"}

        self.local_actor = Actor(in_shape=in_actor, out_shape=out_actor).to(device)
        self.target_actor = Actor(in_shape=in_actor, out_shape=out_actor).to(device)
        self.actor_optimizer = optim.Adam(self.local_actor.parameters(), lr=lr_actor)

        # For a single agent, the critic takes global observations as input and
        # outputs the action-value Q, e.g. global_states = all_states + all_actions.
        self.local_critic = Critic(in_shape=in_critic).to(device)
        self.target_critic = Critic(in_shape=in_critic).to(device)
        self.critic_optimizer = optim.Adam(self.local_critic.parameters(), lr=lr_critic)

        # Local and target networks should start from identical weights, so
        # hard-copy the local parameters into the targets at initialization.
        hard_update_A_from_B(self.target_actor, self.local_actor)
        hard_update_A_from_B(self.target_critic, self.local_critic)

        # Noise process
        self.noise = OUNoise(out_actor, scale=1.0)

    def act(self, obs, noise_scale=0.0):
        obs = obs.to(device)
        # debug noise
        # noise = torch.from_numpy(noise_scale*0.5*np.random.randn(1, self.action_size)).float().to(device)
        # action = self.local_actor(obs) + noise
        action = self.local_actor(obs) + noise_scale * self.noise.noise().to(device)
        return action

    def target_act(self, obs, noise_scale=0.0):
        obs = obs.to(device)
        # noise = torch.from_numpy(noise_scale*0.5 * np.random.randn(1, self.action_size)).float().to(device)
        # action = self.target_actor(obs) + noise_scale * noise
        action = self.target_actor(obs) + noise_scale * self.noise.noise().to(device)
        return action

    def reset(self):
        self.noise.reset()
def __init__(self):
    self.max_action = 1
    self.policy_freq = 2
    self.policy_freq_it = 0
    self.batch_size = 512
    self.discount = 0.99
    self.device = 'cuda'
    self.state_dim = 24
    self.action_dim = 2
    self.policy_noise = 0.1
    self.agents = 1
    self.random_period = 1e4
    self.tau = 5e-3
    self.replay_buffer = ReplayBuffer(int(1e5))

    self.actor = Actor(self.state_dim, self.action_dim, self.max_action).to(self.device)
    self.actor_target = Actor(self.state_dim, self.action_dim, self.max_action).to(self.device)
    self.actor_target.load_state_dict(self.actor.state_dict())
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=1e-4)
    # self.actor.load_state_dict(torch.load('actor2.pth'))
    # self.actor_target.load_state_dict(torch.load('actor2.pth'))

    self.noise = OUNoise(2, 32)

    # The critic sees 48 inputs, presumably both agents' 24-dim observations concatenated.
    self.critic = Critic(48, self.action_dim).to(self.device)
    self.critic_target = Critic(48, self.action_dim).to(self.device)
    self.critic_target.load_state_dict(self.critic.state_dict())
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=3e-4)
def __init__(self, num_agents, state_size, action_size, random_seed=2018):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        random_seed (int): random seed
    """
    self.num_agents = num_agents
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(random_seed)
    self.device = torch.device('cuda' if cuda else 'cpu')

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

    # Noise process
    self.noise = OUNoise(action_size, random_seed)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed, device)
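# ReplayBuffer is constructed above with (action_size, BUFFER_SIZE, BATCH_SIZE,
# seed, device) but never defined in these snippets. A minimal sketch consistent
# with that signature (the field layout is conventional, not from the source):
import random
from collections import deque, namedtuple

import numpy as np
import torch

class ReplayBuffer:
    """Fixed-size buffer of experience tuples for off-policy learning."""

    def __init__(self, action_size, buffer_size, batch_size, seed, device):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.device = device
        random.seed(seed)
        self.experience = namedtuple("Experience",
                                     ["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        batch = random.sample(self.memory, k=self.batch_size)
        as_tensor = lambda xs: torch.from_numpy(np.vstack(xs)).float().to(self.device)
        states = as_tensor([e.state for e in batch])
        actions = as_tensor([e.action for e in batch])
        rewards = as_tensor([e.reward for e in batch])
        next_states = as_tensor([e.next_state for e in batch])
        dones = torch.from_numpy(
            np.vstack([e.done for e in batch]).astype(np.uint8)).float().to(self.device)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)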