def __init__(self, state_size, action_size, n_agents, seed):
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)
    self.stacked_state_size = state_size * n_agents
    self.stacked_action_size = action_size * n_agents

    # Actor networks
    self.actor_local = ActorNetwork(state_size, action_size, seed).to(device)
    self.actor_target = ActorNetwork(state_size, action_size, seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=ACTOR_LR)

    # Critic networks
    self.critic_local = CriticNetwork(self.stacked_state_size, self.stacked_action_size, seed).to(device)
    self.critic_target = CriticNetwork(self.stacked_state_size, self.stacked_action_size, seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=CRITIC_LR)

    # OUNoise
    self.exploration_noise = OUNoise(action_size, seed)

def __init__(self, state_size, action_size, agent_id):
    self.state_size = state_size
    self.action_size = action_size
    self.seed = args['seed']
    self.device = args['device']
    # self.args = args

    # Q-Network
    self.actor_network = ActorNetwork(state_size, action_size).to(self.device)
    self.actor_target = ActorNetwork(state_size, action_size).to(self.device)
    self.actor_optimizer = optim.Adam(self.actor_network.parameters(), lr=args['LR_ACTOR'])

    # Model takes too long to run --> load model weights from previous run (took > 24 hours on my machine)
    # if not agent_id:
    #     self.actor_network.load_state_dict(torch.load(args['agent_p0_path']), strict=False)
    #     self.actor_target.load_state_dict(torch.load(args['agent_p0_path']), strict=False)
    # else:
    #     self.actor_network.load_state_dict(torch.load(args['agent_p1_path']), strict=False)
    #     self.actor_target.load_state_dict(torch.load(args['agent_p1_path']), strict=False)

    # Replay memory
    self.memory = ReplayBuffer(action_size, args['BUFFER_SIZE'], args['BATCH_SIZE'], self.device, self.seed)

    # Noise process
    self.noise = OUNoise(action_size, self.seed)

    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
    self.mCriticLoss = 0
    self.actorLoss = 0

def __init__(self, state_size, obs_size, action_size, num_agents):
    super(DDPGAgent, self).__init__()

    # self.actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
    # self.critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)
    # self.target_actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
    # self.target_critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)
    self.actor = ActorNetwork(obs_size, action_size).to(device)
    self.critic = CriticNetwork(state_size, action_size * num_agents).to(device)
    self.target_actor = ActorNetwork(obs_size, action_size).to(device)
    self.target_critic = CriticNetwork(state_size, action_size * num_agents).to(device)

    # self.noise = OUNoise(out_actor, scale=1.0)
    self.noise = OUNoise(action_size, scale=1.0)

    # initialize targets same as original networks
    hard_update(self.target_actor, self.actor)
    hard_update(self.target_critic, self.critic)

    self.actor_optimizer = Adam(self.actor.parameters(), lr=LR_ACTOR)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

def __init__(self, state_size, action_size, random_seed, agent_size=1):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        random_seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(random_seed)
    self.agent_size = agent_size

    self.local_actor = ActorNetwork(state_size, action_size, random_seed).to(device)
    self.target_actor = ActorNetwork(state_size, action_size, random_seed).to(device)
    self.local_critic = CriticNetwork(state_size, action_size, random_seed).to(device)
    self.target_critic = CriticNetwork(state_size, action_size, random_seed).to(device)

    self.opt_actor = optim.Adam(self.local_actor.parameters(), lr=LR_ACTOR)
    self.opt_critic = optim.Adam(self.local_critic.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

    self.noise = OUNoise(action_size, random_seed)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

class Actor():
    def __init__(self, state_size, action_size, random_seed, learning_rate, noise, device):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.learning_rate = learning_rate

        self.actor_local = ActorNetwork(state_size, action_size, random_seed).to(device)
        self.actor_target = ActorNetwork(state_size, action_size, random_seed).to(device)
        hard_update(self.actor_target, self.actor_local)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.learning_rate)

        self.noise = noise
        self.device = device

    def act(self, state, noise_factor, add_noise):
        """Returns actions for given state as per given policy."""
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += (noise_factor * self.noise.sample())
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

def __init__(self, state_size, action_size, seed):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)

    # create the local and target actor networks
    self.actor_local = ActorNetwork(state_size, action_size, seed).to(device)
    self.actor_target = ActorNetwork(state_size, action_size, seed).to(device)

    # create the local and target critic networks
    self.critic_local = CriticNetwork(state_size, action_size, seed).to(device)
    self.critic_target = CriticNetwork(state_size, action_size, seed).to(device)

    # optimizers for local actor and critic
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR, weight_decay=0.0)

    # MSE loss for updating the critic
    self.critic_loss_function = nn.MSELoss()

    # ensure that the local and target networks are initialized with the same random weights
    for target_param, param in zip(self.actor_target.parameters(), self.actor_local.parameters()):
        target_param.data.copy_(param.data)
    for target_param, param in zip(self.critic_target.parameters(), self.critic_local.parameters()):
        target_param.data.copy_(param.data)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0

    # init the noise class to sample from
    self.noise = GaussianNoise(self.action_size)

def create_model(context):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    context.sess = tf.InteractiveSession(config=config)

    context.actor = ActorNetwork(
        context.sess,
        [len(context.assets), context.n, len(context.features)],
        len(context.assets) + 1,
        context.actor_learning_rate,
        context.tau,
        context.minibatch_size)
    context.critic = CriticNetwork(
        context.sess,
        [len(context.assets), context.n, len(context.features)],
        len(context.assets) + 1,
        context.critic_learning_rate,
        context.tau,
        context.gamma,
        context.actor.get_num_trainable_vars())

    # Initialize the TensorFlow variables
    context.sess.run(tf.global_variables_initializer())
    context.saver = tf.train.Saver()

    # Initialize the target network weights
    context.actor.update_target_network()
    context.critic.update_target_network()

class Learner:
    def __init__(self, opt, q_batch):
        self.opt = opt
        self.q_batch = q_batch
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.env = gym.make(self.opt.env)
        self.env.seed(self.opt.seed)
        self.n_state = self.env.observation_space.shape[0]
        self.n_act = self.env.action_space.n

        self.actor = ActorNetwork(self.n_state, self.n_act).to(self.device)
        self.critic = CriticNetwork(self.n_state).to(self.device)
        self.actor.share_memory()
        self.critic.share_memory()

        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=opt.lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=opt.lr)

    def learning(self):
        torch.manual_seed(self.opt.seed)

        while True:
            # batch-trace
            states, actions, rewards = self.q_batch.get(block=True)
            onehot_actions = torch.FloatTensor(index2onehot(actions, self.n_act)).to(self.device)

            # update actor network
            self.actor_optimizer.zero_grad()
            action_log_probs = self.actor(states)
            action_log_probs = torch.sum(action_log_probs * onehot_actions, 1)
            values = self.critic(states)
            advantages = rewards - values.detach()
            pg_loss = -torch.sum(action_log_probs * advantages)
            actor_loss = pg_loss
            actor_loss.backward()
            self.actor_optimizer.step()

            # update critic network
            self.critic_optimizer.zero_grad()
            target_values = rewards
            critic_loss = nn.MSELoss()(values, target_values)
            critic_loss.backward()
            self.critic_optimizer.step()

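# Assumed helper (not shown in the snippet above): one plausible index2onehot,
# mapping a vector of integer action indices to one-hot rows of width n_act.
# It assumes `indices` arrives as a NumPy array (or CPU tensor) of ints.
import numpy as np

def index2onehot(indices, n_act):
    indices = np.asarray(indices, dtype=np.int64).reshape(-1)
    onehot = np.zeros((indices.shape[0], n_act), dtype=np.float32)
    onehot[np.arange(indices.shape[0]), indices] = 1.0
    return onehot
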
def __init__(self, state_size, action_size, random_seed, learning_rate, noise, device):
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(random_seed)
    self.learning_rate = learning_rate

    self.actor_local = ActorNetwork(state_size, action_size, random_seed).to(device)
    self.actor_target = ActorNetwork(state_size, action_size, random_seed).to(device)
    hard_update(self.actor_target, self.actor_local)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.learning_rate)

    self.noise = noise
    self.device = device

def build_actor_critic(sess, env):
    w_init = tflearn.initializations.xavier_initializer()
    with tf.variable_scope("model", reuse=None, initializer=w_init):
        with tf.name_scope("actor"):
            actor = ActorNetwork(sess, env, config, is_training=True)
        with tf.name_scope("critic"):
            critic = CriticNetwork(sess, env, config, is_training=True)

    sess.run(tf.global_variables_initializer())

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    return actor, critic

def __init__(self, opt, q_batch):
    self.opt = opt
    self.q_batch = q_batch
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    self.env = gym.make(self.opt.env)
    self.env.seed(self.opt.seed)
    self.n_state = self.env.observation_space.shape[0]
    self.n_act = self.env.action_space.n

    self.actor = ActorNetwork(self.n_state, self.n_act).to(self.device)
    self.critic = CriticNetwork(self.n_state).to(self.device)
    self.actor.share_memory()
    self.critic.share_memory()

    self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=opt.lr)
    self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=opt.lr)

def __init__(self, state_size, action_size, memory, seed=None):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    if seed is not None:
        self.seed = seed

    # create the local and target actor networks
    self.actor_local = ActorNetwork(state_size, action_size, seed).to(device)
    self.actor_target = ActorNetwork(state_size, action_size, seed).to(device)

    # create the local and target critic networks
    self.critic_local = CriticNetwork(state_size, action_size, seed).to(device)
    self.critic_target = CriticNetwork(state_size, action_size, seed).to(device)

    # optimizers for local actor and critic
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR, weight_decay=0.0)

    # Huber loss for updating the critic
    # self.critic_loss_function = nn.MSELoss()
    self.critic_loss_function = nn.SmoothL1Loss()

    # copy the local networks' weights to the target networks
    self.copy_weights_from_local_to_target()

    # Replay memory
    self.memory = memory

    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0

    # init the noise class to sample from
    self.noise = GaussianNoise(self.action_size)

def __init__(self, state_size, action_size, num_agents,
             hidden_in_actor=512, hidden_out_actor=256, lr_actor=1e-4,
             hidden_in_critic=512, hidden_out_critic=256, lr_critic=3e-4,
             weight_decay_critic=0, seed=1, device='cpu'):
    super(DDPGAgent, self).__init__()
    self.device = device

    # Actor
    self.actor = ActorNetwork(state_size, hidden_in_actor, hidden_out_actor, action_size, seed).to(device)
    self.target_actor = ActorNetwork(state_size, hidden_in_actor, hidden_out_actor, action_size, seed).to(device)
    self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)

    # Critic
    self.critic = CriticNetwork(state_size, action_size, num_agents, hidden_in_critic, hidden_out_critic, seed).to(device)
    self.target_critic = CriticNetwork(state_size, action_size, num_agents, hidden_in_critic, hidden_out_critic, seed).to(device)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=weight_decay_critic)

    # Noise
    self.noise = OUNoise(action_size, seed, scale=1.0)

    # initialize targets same as original networks
    hard_update(self.target_actor, self.actor)
    hard_update(self.target_critic, self.critic)

def build_actor_critic(sess, env):
    with tf.variable_scope("model", reuse=None):
        with tf.name_scope("actor"):
            actor = ActorNetwork(sess, env, config, is_training=False)
        with tf.name_scope("critic"):
            critic = CriticNetwork(sess, env, config, is_training=False)

    sess.run(tf.global_variables_initializer())

    return actor, critic

class DDPGAgent:
    # def __init__(self, in_actor=14, hidden_in_actor=16, hidden_out_actor=8, out_actor=2,
    #              in_critic=20, hidden_in_critic=32, hidden_out_critic=16,
    #              lr_actor=1.0e-2, lr_critic=1.0e-2):
    def __init__(self, state_size, obs_size, action_size, num_agents):
        super(DDPGAgent, self).__init__()

        # self.actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
        # self.critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)
        # self.target_actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
        # self.target_critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)
        self.actor = ActorNetwork(obs_size, action_size).to(device)
        self.critic = CriticNetwork(state_size, action_size * num_agents).to(device)
        self.target_actor = ActorNetwork(obs_size, action_size).to(device)
        self.target_critic = CriticNetwork(state_size, action_size * num_agents).to(device)

        # self.noise = OUNoise(out_actor, scale=1.0)
        self.noise = OUNoise(action_size, scale=1.0)

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=LR_ACTOR)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

    def act(self, obs, noise=0.0):
        if type(obs) == np.ndarray:
            obs = torch.from_numpy(obs).float().to(device)
        # self.actor.eval()
        action = self.actor(obs)
        action += noise * self.noise.noise()
        # self.actor.train()
        # return action.cpu().data.numpy()
        return action

    def target_act(self, obs, noise=0.0):
        if type(obs) == np.ndarray:
            obs = torch.from_numpy(obs).float().to(device)
        # obs = obs.to(device)
        # self.target_actor.eval()
        # action = self.target_actor(obs) + noise*self.noise.noise()
        action = self.target_actor(obs)
        action += noise * self.noise.noise()
        # self.target_actor.train()
        # return action.cpu().data.numpy()
        return action

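# Assumed helper (not defined in any snippet above): hard_update is called by
# several of these agents to copy the online network's parameters into the
# target network at construction time. A plausible minimal version, mirroring
# the parameter-copy idiom the soft_update methods elsewhere in this
# collection use with tau = 1:
def hard_update(target, source):
    """Copy every parameter of `source` into `target`."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(source_param.data)
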
def __init__(self, opt, actor_id, q_trace, learner):
    self.opt = opt
    self.q_trace = q_trace
    self.learner = learner
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    self.env = gym.make(self.opt.env)
    self.env.seed(self.opt.seed + actor_id)
    self.n_state = self.env.observation_space.shape[0]
    self.n_act = self.env.action_space.n

    self.n_episodes = 0
    self.n_steps = 0
    self.gamma = opt.gamma

    # epsilon
    self.eps_greedy = 0.4 ** (1 + actor_id * 7 / (opt.n_actors - 1)) \
        if opt.n_actors > 1 else 0.4

    # models
    self.actor = ActorNetwork(self.n_state, self.n_act).to(self.device)
    self.critic = CriticNetwork(self.n_state).to(self.device)

def build_actor_critic(sess, env, env_eval):
    w_init = tflearn.initializations.xavier_initializer()
    with tf.variable_scope("model", reuse=None, initializer=w_init):
        with tf.name_scope("actor"):
            actor = ActorNetwork(sess, env, config, is_training=True)
        with tf.name_scope("critic"):
            critic = CriticNetwork(sess, env, config, is_training=True)

    # if config.noise_std:
    #     actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(env.input_cardin * env.state_cardin),
    #                                                sigma=config.noise_std,
    #                                                sigma_dec=config.noise_dec)
    # else:
    #     actor_noise = None

    with tf.variable_scope("model", reuse=True):
        with tf.name_scope("actor"):
            actor_eval = ActorNetwork(sess, env_eval, config, is_training=False)
        with tf.name_scope("critic"):
            critic_eval = CriticNetwork(sess, env_eval, config, is_training=False)

    sess.run(tf.global_variables_initializer())

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    return actor, critic, actor_eval, critic_eval  # , actor_noise

def __init__(self, device, key, state_size, action_size, random_seed, memory, noise,
             lr, weight_decay, checkpoint_folder='./Saved_Model/'):
    self.DEVICE = device
    self.KEY = key

    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(random_seed)

    # Hyperparameters
    self.LR = lr
    self.WEIGHT_DECAY = weight_decay
    self.CHECKPOINT_FOLDER = checkpoint_folder

    # Actor Network (w/ Target Network)
    self.local = ActorNetwork(state_size, action_size, random_seed).to(self.DEVICE)
    self.target = ActorNetwork(state_size, action_size, random_seed).to(self.DEVICE)
    self.optimizer = optim.Adam(self.local.parameters(), lr=self.LR)

    self.checkpoint_full_name = self.CHECKPOINT_FOLDER + 'checkpoint_actor_' + str(self.KEY) + '.pth'
    if os.path.isfile(self.checkpoint_full_name):
        self.local.load_state_dict(torch.load(self.checkpoint_full_name))
        self.target.load_state_dict(torch.load(self.checkpoint_full_name))

    # Replay memory
    self.memory = memory

    # Noise process
    self.noise = noise

def __init__(self, opt, actor_id, q_trace, learner):
    self.opt = opt
    self.q_trace = q_trace
    self.learner = learner

    self.env = gym.make(self.opt.env)
    self.env_state = self.env.reset()
    self.n_state = self.env.observation_space.shape[0]
    self.n_act = self.env.action_space.n
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    ## parameters
    self.batch_size = opt.batch_size
    self.roll_out_n_steps = opt.roll_out_n_steps
    self.gamma = opt.gamma
    self.eps_greedy = 0.4 ** (1 + actor_id * 7 / (opt.n_actors - 1)) \
        if opt.n_actors > 1 else 0.4
    self.n_episodes = 0

    self.actor = ActorNetwork(self.n_state, self.n_act).to(self.device)  # ActorNetwork
    self.critic = CriticNetwork(self.n_state).to(self.device)  # CriticNetwork

def __init__(self, path_to_weights, weights_id):
    print("Init ANN")
    self.init = False

    # avoid TF allocating all GPU memory
    # https://stackoverflow.com/questions/34199233/how-to-prevent-tensorflow-from-allocating-the-totality-of-a-gpu-memory
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    # sess = tf.Session()
    K.set_session(sess)

    self.path_to_weights = path_to_weights
    self.next_weights_id = weights_id

    # check if path is valid
    if not os.path.isdir(self.path_to_weights):
        print("Folder with weights does not exist!")
        print(self.path_to_weights)
        self.init = False
    else:
        self.init = True

    if self.init:
        # create actor network
        self.actor = ActorNetwork.ActorNetwork(sess, 5, 2)

        # load first weights
        print("Loading first weights for ANN from " + str(self.path_to_weights) + "/" + str(self.next_weights_id) + "/actormodel.h5")
        try:
            self.actor.model.load_weights(
                str(self.path_to_weights) + "/" + str(self.next_weights_id) + "/actormodel.h5")
            self.next_weights_id = self.next_weights_id + 1
        except:
            print("Cannot find the weight (.h5) file")
            print(str(self.path_to_weights) + "/" + str(self.next_weights_id) + "/actormodel.h5")
            self.init = False

class DDPGAgent:
    def __init__(self, state_size, action_size, num_agents,
                 hidden_in_actor=512, hidden_out_actor=256, lr_actor=1e-4,
                 hidden_in_critic=512, hidden_out_critic=256, lr_critic=3e-4,
                 weight_decay_critic=0, seed=1, device='cpu'):
        super(DDPGAgent, self).__init__()
        self.device = device

        # Actor
        self.actor = ActorNetwork(state_size, hidden_in_actor, hidden_out_actor, action_size, seed).to(device)
        self.target_actor = ActorNetwork(state_size, hidden_in_actor, hidden_out_actor, action_size, seed).to(device)
        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)

        # Critic
        self.critic = CriticNetwork(state_size, action_size, num_agents, hidden_in_critic, hidden_out_critic, seed).to(device)
        self.target_critic = CriticNetwork(state_size, action_size, num_agents, hidden_in_critic, hidden_out_critic, seed).to(device)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=weight_decay_critic)

        # Noise
        self.noise = OUNoise(action_size, seed, scale=1.0)

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

    def reset(self):
        self.noise.reset()

    def act(self, obs, noise_factor=0.0):
        if torch.is_tensor(obs):
            states = obs
        else:
            states = torch.from_numpy(obs).float().to(self.device)
        self.actor.eval()
        with torch.no_grad():
            actions = self.actor(states).cpu().data.numpy()
        self.actor.train()
        actions += noise_factor * self.noise.sample()
        return np.clip(actions, -1, 1)

    def target_act(self, obs):
        if torch.is_tensor(obs):
            states = obs
        else:
            states = torch.from_numpy(obs).float().to(self.device)
        self.target_actor.eval()
        with torch.no_grad():
            actions = self.target_actor(states).cpu().data.numpy()
        self.target_actor.train()
        return np.clip(actions, -1, 1)

def main(args):
    with tf.Session() as sess:
        env = gym.make(args['env'])
        np.random.seed(int(args['random_seed']))
        tf.set_random_seed(int(args['random_seed']))
        env.seed(int(args['random_seed']))

        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        action_bound = env.action_space.high
        # Ensure action bound is symmetric
        # assert (env.action_space.high == -env.action_space.low)

        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             float(args['actor_lr']), float(args['tau']),
                             int(args['minibatch_size']))
        critic = CriticNetwork(sess, state_dim, action_dim,
                               float(args['critic_lr']), float(args['tau']),
                               float(args['gamma']),
                               actor.get_num_trainable_vars())
        actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim))

        if args['train']:
            if not os.path.exists(args['save_dir']):
                os.makedirs(args['save_dir'])
            with open(os.path.join(args['save_dir'], 'config.json'), 'w') as f:
                json.dump(args, f, indent=2)
            train(sess, env, args, actor, critic, actor_noise)
        else:
            # ddpg = []
            # indexes = [e for e in range(400) if e % 10 == 9]
            # indexes = [0] + indexes
            indexes = [399]
            num_test_tasks = 100
            buckets = 1
            successes = []
            directory = args['to_pickle']

            for index in indexes:
                # times = []
                task_success = []
                saver = tf.train.Saver()
                saver.restore(
                    sess,
                    "../final_models/multitask/fixed/{0}/model-{1}.ckpt".format(directory, index))
                for _ in range(buckets):
                    tasks = env.unwrapped.sample_tasks(num_test_tasks)
                    # tasks = [{'goal': np.array([0., 0.])} for e in range(num_test_tasks)]
                    success = 0
                    for task in tasks:
                        s = env.reset_task(task)
                        step = 0
                        d = False
                        while not d:
                            # env.render()
                            action = actor.predict_target(np.reshape(s, (1, actor.s_dim)))[0]
                            step += 1
                            s, r, d, _ = env.step(action)
                            if r == 1:
                                success += 1
                        # times.append(step)
                    env.close()
                    task_success.append(success / num_test_tasks)
                successes.append(task_success)
                # ddpg.append(times)
            # out = [successes, ddpg]

            env.close()
            if not os.path.exists('./pkls'):
                os.makedirs('./pkls')
            with open('./pkls/{0}.pkl'.format(args['save_dir']), 'wb') as f:
                pickle.dump(successes, f)

class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, agent_id, args):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = args['seed']
        self.device = args['device']
        self.args = args

        # Q-Network
        self.actor_network = ActorNetwork(state_size, action_size, args).to(self.device)
        self.actor_target = ActorNetwork(state_size, action_size, args).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_network.parameters(), lr=args['LR_ACTOR'])

        # Model takes too long to run --> load model weights from previous run (took > 24 hours on my machine)
        if not agent_id:
            self.actor_network.load_state_dict(torch.load(args['agent_p0_path']), strict=False)
            self.actor_target.load_state_dict(torch.load(args['agent_p0_path']), strict=False)
        else:
            self.actor_network.load_state_dict(torch.load(args['agent_p1_path']), strict=False)
            self.actor_target.load_state_dict(torch.load(args['agent_p1_path']), strict=False)

        # Replay memory
        self.memory = ReplayBuffer(action_size, args['BUFFER_SIZE'], args['BATCH_SIZE'], self.seed)

        # Noise process
        self.noise = OUNoise(action_size, self.seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        if len(self.memory) > self.args['BATCH_SIZE']:
            experiences = self.memory.sample()
            self.train(experiences)

    def act(self, current_state):
        self.actor_network.eval()
        input_state = torch.from_numpy(current_state).float().to(self.device)
        with torch.no_grad():
            action = self.actor_network(input_state).cpu().data.numpy()
        self.actor_network.train()
        action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def train(self, experiences):
        global states_
        global next_states_
        global actions_
        global max_min_actions_vector
        global max_min_states_vector

        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        with torch.no_grad():
            # Get predicted next-state actions and Q values from target models
            actions_next = self.actor_target(next_states)
            Q_targets_next = mCritic.target(next_states, actions_next)
            # Compute Q targets for current states (y_i)
            Q_targets = rewards + (GAMMA * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = mCritic.network(states, actions)
        mCritic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        mCritic.optimizer.zero_grad()
        mCritic_loss.backward()
        mCritic.optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_network(states)
        actor_loss = -mCritic.network(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(mCritic.network, mCritic.target, TAU)
        self.soft_update(self.actor_network, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters: θ_target = τ*θ_local + (1 - τ)*θ_target.

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

class Learner(object):
    def __init__(self, opt, q_batch):
        self.opt = opt
        self.q_batch = q_batch
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.env = gym.make(self.opt.env)
        self.env.seed(self.opt.seed)
        self.n_state = self.env.observation_space.shape[0]
        self.n_act = self.env.action_space.n

        self.actor = ActorNetwork(self.n_state, self.n_act).to(self.device)
        self.critic = CriticNetwork(self.n_state).to(self.device)
        self.actor.share_memory()
        self.critic.share_memory()

        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=opt.lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=opt.lr)

    def learning(self):
        torch.manual_seed(self.opt.seed)

        coef_hat = torch.FloatTensor([self.opt.coef_hat] * self.opt.batch_size * self.opt.n_step).view(self.opt.batch_size, self.opt.n_step)
        rho_hat = torch.FloatTensor([self.opt.rho_hat] * self.opt.batch_size * self.opt.n_step).view(self.opt.batch_size, self.opt.n_step)

        while True:
            # batch-trace
            states, actions, rewards, dones, action_log_probs = self.q_batch.get(block=True)

            logit_log_probs = self.actor(states)
            V = self.critic(states).view(self.opt.batch_size, self.opt.n_step) * (1 - dones)

            action_probs = torch.exp(action_log_probs)
            logit_probs = torch.exp(logit_log_probs)
            is_rate = torch.prod(logit_probs / (action_probs + 1e-6), dim=-1).detach()
            coef = torch.min(coef_hat, is_rate) * (1 - dones)
            rho = torch.min(rho_hat, is_rate) * (1 - dones)

            # V-trace
            v_trace = torch.zeros((self.opt.batch_size, self.opt.n_step)).to(self.device)
            target_V = V.detach()
            for rev_step in reversed(range(states.size(1) - 1)):
                v_trace[:, rev_step] = target_V[:, rev_step] \
                    + rho[:, rev_step] * (rewards[:, rev_step] + self.opt.gamma * target_V[:, rev_step + 1] - target_V[:, rev_step]) \
                    + self.opt.gamma * coef[:, rev_step] * (v_trace[:, rev_step + 1] - target_V[:, rev_step + 1])

            # actor loss
            onehot_actions = torch.FloatTensor(
                idx2onehot(actions.cpu().numpy(), self.opt.batch_size, self.n_act)).to(self.device)
            logit_log_probs = torch.sum(logit_log_probs * onehot_actions, dim=-1)
            advantages = rewards + self.opt.gamma * v_trace - V
            pg_loss = -torch.sum(logit_log_probs * advantages.detach())
            actor_loss = pg_loss

            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # critic loss
            critic_loss = torch.mean((v_trace.detach() - V) ** 2)
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

class Actor(object):
    def __init__(self, opt, actor_id, q_trace, learner):
        self.opt = opt
        self.q_trace = q_trace
        self.learner = learner
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.env = gym.make(self.opt.env)
        self.env.seed(self.opt.seed + actor_id)
        self.n_state = self.env.observation_space.shape[0]
        self.n_act = self.env.action_space.n

        self.n_episodes = 0
        self.n_steps = 0
        self.gamma = opt.gamma

        # epsilon
        self.eps_greedy = 0.4 ** (1 + actor_id * 7 / (opt.n_actors - 1)) \
            if opt.n_actors > 1 else 0.4

        # models
        self.actor = ActorNetwork(self.n_state, self.n_act).to(self.device)
        self.critic = CriticNetwork(self.n_state).to(self.device)

    def performing(self):
        torch.manual_seed(self.opt.seed)

        while True:
            self.load_model()
            self.train_episode()

            if self.n_episodes % 100 == 0:
                rewards = self.evaluation(self.env)
                rewards_mu = np.array([np.sum(np.array(l_i), 0) for l_i in rewards]).mean()
                print("Episode %d, Average Reward %.2f" % (self.n_episodes, rewards_mu))

    def _softmax_action(self, state):
        state = torch.FloatTensor([state]).to(self.device)
        softmax_action = torch.exp(self.actor(state))  # exponentiate the log-output to get action probabilities
        softmax_action = softmax_action.cpu().detach().numpy()
        return softmax_action

    def exploration_action(self, state):
        softmax_action = self._softmax_action(state)
        if np.random.rand() > self.eps_greedy:
            return np.argmax(softmax_action)
        else:
            return np.random.choice(self.n_act)

    def train_episode(self):
        done = False
        state = self.env.reset()
        self.env_state = state
        self.next_done = done

        while not done:
            self.n_steps += 1
            states = np.zeros((self.opt.n_step, self.n_state))
            actions = np.zeros(self.opt.n_step)
            rewards = np.zeros(self.opt.n_step)
            log_probs = np.zeros((self.opt.n_step, self.n_act))
            dones = np.ones(self.opt.n_step)

            for i in range(self.opt.n_step):
                states[i] = self.env_state
                dones[i] = self.next_done
                log_prob = self.actor(torch.FloatTensor([state]).to(self.device)).detach().cpu().numpy()[0]
                action = self.exploration_action(state)
                next_state, reward, done, info = self.env.step(action)

                reward = 0
                if done:
                    if self.n_steps > 190:
                        reward = 1
                    else:
                        reward = -1

                log_probs[i] = log_prob
                actions[i] = action
                rewards[i] = reward
                self.env_state = next_state
                self.next_done = done

                if done:
                    self.env_state = self.env.reset()
                    break

            # end of the n-step rollout
            if done:
                self.n_steps = 0
                self.n_episodes += 1
                self.episode_done = True
            else:
                self.episode_done = False

            self.q_trace.put((states, actions, rewards, dones, log_probs), block=True)

    # choose an action based on state for execution
    def action(self, state):
        softmax_action = self._softmax_action(state)
        action = np.argmax(softmax_action)
        return action

    def value(self, state):
        # output the Q-values for the given state
        state_var = torch.FloatTensor([state]).to(self.device)
        q_var = self.critic(state_var)
        q = q_var.cpu().detach().numpy()
        return q

    def _discount_reward(self, rewards, final_value):
        discounted_r = np.zeros_like(rewards)
        R = final_value  # Q(s_t, a_t)
        for t in reversed(range(0, len(rewards))):
            R = rewards[t] + self.gamma * R
            discounted_r[t] = R
        return discounted_r

    def evaluation(self, env_eval):
        rewards = []
        for i in range(10):
            rewards_i = []
            state = env_eval.reset()
            action = self.action(state)
            state, reward, done, _ = env_eval.step(action)
            rewards_i.append(reward)
            while not done:
                action = self.action(state)
                state, reward, done, _ = env_eval.step(action)
                rewards_i.append(reward)
            rewards.append(rewards_i)
        return rewards

    def load_model(self):
        try:
            self.actor.load_state_dict(self.learner.actor.state_dict())
            self.critic.load_state_dict(self.learner.critic.state_dict())
        except:
            print('load error')

import torch
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
from torch.autograd import Variable
import gym
import numpy as np
import matplotlib.pyplot as plt

from model import ActorNetwork, CriticNetwork

actor = ActorNetwork(4, 2)
critic = CriticNetwork(4)
actor_optimizer = optim.Adam(actor.parameters(), lr=1e-4)
critic_optimizer = optim.Adam(critic.parameters(), lr=8e-4)

env = gym.make('CartPole-v0')

GAMMA = 0.99
N_EPISODES = 20000
LOG_STEPS = 100
SAVE_STEPS = 100


def select_action(S):
    '''
    select an action based on the current state
    args:
        S: current state
    returns:
        action to take, log probability of the chosen action
    '''
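    # Assumed completion (not part of the original snippet): it presumes that
    # ActorNetwork(4, 2) returns log-probabilities over the two CartPole
    # actions, which is how other snippets in this collection treat its output.
    log_probs = actor(torch.FloatTensor(S).unsqueeze(0))
    dist = Categorical(logits=log_probs)
    action = dist.sample()
    return action.item(), dist.log_prob(action)
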
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed, agent_size=1):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.agent_size = agent_size

        self.local_actor = ActorNetwork(state_size, action_size, random_seed).to(device)
        self.target_actor = ActorNetwork(state_size, action_size, random_seed).to(device)
        self.local_critic = CriticNetwork(state_size, action_size, random_seed).to(device)
        self.target_critic = CriticNetwork(state_size, action_size, random_seed).to(device)

        self.opt_actor = optim.Adam(self.local_actor.parameters(), lr=LR_ACTOR)
        self.opt_critic = optim.Adam(self.local_critic.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def save_experience(self, state, action, reward, next_state, done):
        """Save experience in replay memory."""
        self.memory.add(state, action, reward, next_state, done)

    def multi_step(self, t):
        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            if t % 20 == 0:
                for i in range(0, 10):
                    self.learn(self.memory.sample(), GAMMA)
            else:
                pass

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.local_actor.eval()
        with torch.no_grad():
            action = self.local_actor(state).cpu().data.numpy()
        self.local_actor.train()
        if add_noise:
            for a in range(0, self.agent_size):
                action[a] += self.noise.sample()
        return np.clip(action, -1, 1)  # all actions between -1 and 1

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using a given batch of experience tuples.

        Target and local critics/actors are used to solve the moving-target problem:
        the target actor generates the next action, and the target critic generates
        the corresponding Q-value.

        Q_targets = r + gamma * critic_t(next_state, actor_t(next_state))

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.target_actor(next_states)
        Q_targets_next = self.target_critic(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.local_critic(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.opt_critic.zero_grad()
        critic_loss.backward()
        # use gradient clipping when training the critic network
        torch.nn.utils.clip_grad_norm_(self.local_critic.parameters(), 1)
        self.opt_critic.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.local_actor(states)
        actor_loss = -self.local_critic(states, actions_pred).mean()
        # Minimize the loss
        self.opt_actor.zero_grad()
        actor_loss.backward()
        # use gradient clipping when training the actor network
        torch.nn.utils.clip_grad_norm_(self.local_actor.parameters(), 1)
        self.opt_actor.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.local_critic, self.target_critic, TAU)
        self.soft_update(self.local_actor, self.target_actor, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters: theta_target = tau*theta_local + (1 - tau)*theta_target.

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, memory, seed=None):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        if seed is not None:
            self.seed = seed

        # create the local and target actor networks
        self.actor_local = ActorNetwork(state_size, action_size, seed).to(device)
        self.actor_target = ActorNetwork(state_size, action_size, seed).to(device)

        # create the local and target critic networks
        self.critic_local = CriticNetwork(state_size, action_size, seed).to(device)
        self.critic_target = CriticNetwork(state_size, action_size, seed).to(device)

        # optimizers for local actor and critic
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR, weight_decay=0.0)

        # Huber loss for updating the critic
        # self.critic_loss_function = nn.MSELoss()
        self.critic_loss_function = nn.SmoothL1Loss()

        # copy the local networks' weights to the target networks
        self.copy_weights_from_local_to_target()

        # Replay memory
        self.memory = memory

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        # init the noise class to sample from
        self.noise = GaussianNoise(self.action_size)

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                for _ in range(LEARN_TIMES):
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA)
                self.soft_update_all()

    def copy_weights_from_local_to_target(self):
        # ensure that the local and target networks are initialized with the same random weights,
        # or copy your saved weights after loading them into the local networks
        for target_param, param in zip(self.actor_target.parameters(), self.actor_local.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.critic_target.parameters(), self.critic_local.parameters()):
            target_param.data.copy_(param.data)

    def act(self, state, add_noise=False):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            add_noise (bool): whether to add exploration noise to the action
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # get predicted actions for current state from actor network
        self.actor_local.eval()
        with torch.no_grad():
            action_values = self.actor_local(state)
        self.actor_local.train()

        # take the predicted actions and add noise, used as exploration in a continuous environment
        action_values = action_values.cpu().data.numpy()
        if add_noise == True:
            action_values += self.noise.sample()
        return action_values

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        # unpack the experiences tuple
        states, actions, rewards, next_states, dones = experiences

        # compute the loss for the actor network per the DDPG algorithm
        actor_local_predicted_actions = self.actor_local(states)
        policy_loss = -self.critic_local(states, actor_local_predicted_actions).mean()

        # compute the loss for the critic network per the DDPG algorithm
        predicted_Q_vals = self.critic_local(states, actions)
        predicted_actions = self.actor_target(next_states)
        Q_next = self.critic_target(next_states, predicted_actions)
        Q_targets = rewards + (gamma * Q_next * (1 - dones))
        critic_loss = self.critic_loss_function(predicted_Q_vals, Q_targets)

        # update the networks
        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

    def soft_update_all(self):
        # soft update the target networks
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters: θ_target = τ*θ_local + (1 - τ)*θ_target.

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter (fraction of the local weights blended into the target)
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

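# Assumed helper (not included above): a minimal GaussianNoise matching the
# GaussianNoise(action_size) / .sample() usage in this agent; mu and sigma are
# illustrative defaults, not values taken from the original repository.
import numpy as np

class GaussianNoise:
    def __init__(self, size, mu=0.0, sigma=0.1):
        self.size = size
        self.mu = mu
        self.sigma = sigma

    def sample(self):
        # i.i.d. Gaussian noise, one value per action dimension
        return np.random.normal(self.mu, self.sigma, self.size)
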
class Agent():
    def __init__(self, state_size, action_size, n_agents, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.stacked_state_size = state_size * n_agents
        self.stacked_action_size = action_size * n_agents

        # Actor networks
        self.actor_local = ActorNetwork(state_size, action_size, seed).to(device)
        self.actor_target = ActorNetwork(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=ACTOR_LR)

        # Critic networks
        self.critic_local = CriticNetwork(self.stacked_state_size, self.stacked_action_size, seed).to(device)
        self.critic_target = CriticNetwork(self.stacked_state_size, self.stacked_action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=CRITIC_LR)

        # OUNoise
        self.exploration_noise = OUNoise(action_size, seed)

    def act(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        # Add exploration noise
        action += self.exploration_noise.sample()
        return np.clip(action, -1, 1)

    def update(self, states, current_agent_states, actions, current_agent_actions,
               target_next_actions, rewards, current_agent_rewards, next_states,
               dones, current_agent_dones, action_preds):
        flatten_states = torch.reshape(states, shape=(BATCH_SIZE, -1))
        flatten_next_states = torch.reshape(next_states, shape=(BATCH_SIZE, -1))
        flatten_actions = torch.reshape(actions, shape=(BATCH_SIZE, -1))

        y = current_agent_rewards + GAMMA * self.critic_target(
            flatten_next_states, target_next_actions) * (1 - current_agent_dones)

        # Critic loss
        critic_loss = F.mse_loss(y, self.critic_local(flatten_states, flatten_actions))
        # Critic backprop
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Actor loss
        actor_loss = -self.critic_local(flatten_states, action_preds).mean()
        # Actor backprop
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Soft updates
        self.update_target_network()

    def update_target_network(self):
        for target_param, local_param in zip(self.actor_target.parameters(), self.actor_local.parameters()):
            target_param.data.copy_(TAU * local_param.data + (1.0 - TAU) * target_param.data)
        for target_param, local_param in zip(self.critic_target.parameters(), self.critic_local.parameters()):
            target_param.data.copy_(TAU * local_param.data + (1.0 - TAU) * target_param.data)

class Actor:
    def __init__(self, device, key, state_size, action_size, random_seed, memory, noise,
                 lr, weight_decay, checkpoint_folder='./Saved_Model/'):
        self.DEVICE = device
        self.KEY = key

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Hyperparameters
        self.LR = lr
        self.WEIGHT_DECAY = weight_decay
        self.CHECKPOINT_FOLDER = checkpoint_folder

        # Actor Network (w/ Target Network)
        self.local = ActorNetwork(state_size, action_size, random_seed).to(self.DEVICE)
        self.target = ActorNetwork(state_size, action_size, random_seed).to(self.DEVICE)
        self.optimizer = optim.Adam(self.local.parameters(), lr=self.LR)

        self.checkpoint_full_name = self.CHECKPOINT_FOLDER + 'checkpoint_actor_' + str(self.KEY) + '.pth'
        if os.path.isfile(self.checkpoint_full_name):
            self.local.load_state_dict(torch.load(self.checkpoint_full_name))
            self.target.load_state_dict(torch.load(self.checkpoint_full_name))

        # Replay memory
        self.memory = memory

        # Noise process
        self.noise = noise

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(self.DEVICE)
        self.local.eval()
        with torch.no_grad():
            action = self.local(state).cpu().data.numpy()
        self.local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory."""
        self.memory.add(state, action, reward, next_state, done)

    def reset(self):
        self.noise.reset()

    def checkpoint(self):
        torch.save(self.local.state_dict(), self.checkpoint_full_name)

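# Assumed helper (not included in any snippet above): a typical
# Ornstein-Uhlenbeck noise process matching the OUNoise(action_size, seed) /
# .sample() usage in several agents here. Other snippets construct OUNoise
# with a scale= argument and call .noise() instead, so their version differs;
# this sketch only covers the sample() variant.
import copy
import random
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.seed = random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new state as noise."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(len(x))
        self.state = x + dx
        return self.state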