def __init__(self, state_size, action_size, random_seed, agent_size=1):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        random_seed (int): random seed
        agent_size (int): number of agents (default: 1)
    """
    self.state_size = state_size
    self.action_size = action_size
    random.seed(random_seed)  # random.seed() returns None, so seed the RNG and keep the value
    self.seed = random_seed
    self.agent_size = agent_size

    self.local_actor = ActorNetwork(state_size, action_size, random_seed).to(device)
    self.target_actor = ActorNetwork(state_size, action_size, random_seed).to(device)
    self.local_critic = CriticNetwork(state_size, action_size, random_seed).to(device)
    self.target_critic = CriticNetwork(state_size, action_size, random_seed).to(device)

    self.opt_actor = optim.Adam(self.local_actor.parameters(), lr=LR_ACTOR)
    self.opt_critic = optim.Adam(self.local_critic.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

    self.noise = OUNoise(action_size, random_seed)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
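# `OUNoise` is constructed throughout these snippets but never defined here.
# A minimal sketch of the standard Ornstein-Uhlenbeck process, assuming the
# (size, seed) signature used above; the mu/theta/sigma defaults and the use
# of Gaussian increments are assumptions, not the original implementation.
import copy
import random

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck exploration noise (sketch; defaults are assumptions)."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        random.seed(seed)
        np.random.seed(seed)
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new state as a noise sample."""
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * np.random.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state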
def __init__(self, state_size, action_size, n_agents, seed):
    self.state_size = state_size
    self.action_size = action_size
    random.seed(seed)
    self.seed = seed
    self.stacked_state_size = state_size * n_agents
    self.stacked_action_size = action_size * n_agents

    # Actor networks
    self.actor_local = ActorNetwork(state_size, action_size, seed).to(device)
    self.actor_target = ActorNetwork(state_size, action_size, seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=ACTOR_LR)

    # Critic networks: take the stacked states/actions of all agents (MADDPG-style)
    self.critic_local = CriticNetwork(self.stacked_state_size, self.stacked_action_size, seed).to(device)
    self.critic_target = CriticNetwork(self.stacked_state_size, self.stacked_action_size, seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=CRITIC_LR)

    # OU noise for exploration
    self.exploration_noise = OUNoise(action_size, seed)
def __init__(self, state_size, action_size, agent_id):
    # `args` is a module-level configuration dict (seed, device, learning rates, ...)
    self.state_size = state_size
    self.action_size = action_size
    self.seed = args['seed']
    self.device = args['device']

    # Actor networks (local and target)
    self.actor_network = ActorNetwork(state_size, action_size).to(self.device)
    self.actor_target = ActorNetwork(state_size, action_size).to(self.device)
    self.actor_optimizer = optim.Adam(self.actor_network.parameters(), lr=args['LR_ACTOR'])

    # Model takes too long to run (> 24 hours on my machine) --> optionally load
    # model weights from a previous run:
    # if not agent_id:
    #     self.actor_network.load_state_dict(torch.load(args['agent_p0_path']), strict=False)
    #     self.actor_target.load_state_dict(torch.load(args['agent_p0_path']), strict=False)
    # else:
    #     self.actor_network.load_state_dict(torch.load(args['agent_p1_path']), strict=False)
    #     self.actor_target.load_state_dict(torch.load(args['agent_p1_path']), strict=False)

    # Replay memory
    self.memory = ReplayBuffer(action_size, args['BUFFER_SIZE'], args['BATCH_SIZE'], self.device, self.seed)

    # Noise process
    self.noise = OUNoise(action_size, self.seed)

    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
    self.mCriticLoss = 0
    self.actorLoss = 0
def __init__(self, state_size, obs_size, action_size, num_agents):
    super(DDPGAgent, self).__init__()

    # Each actor sees only its own observation; the critic sees the joint
    # state and the actions of all agents (MADDPG-style).
    self.actor = ActorNetwork(obs_size, action_size).to(device)
    self.critic = CriticNetwork(state_size, action_size * num_agents).to(device)
    self.target_actor = ActorNetwork(obs_size, action_size).to(device)
    self.target_critic = CriticNetwork(state_size, action_size * num_agents).to(device)

    self.noise = OUNoise(action_size, scale=1.0)

    # initialize targets same as original networks
    hard_update(self.target_actor, self.actor)
    hard_update(self.target_critic, self.critic)

    self.actor_optimizer = Adam(self.actor.parameters(), lr=LR_ACTOR)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)
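# `hard_update` is called above but not shown. A minimal sketch assuming the
# (target, source) argument order used in these snippets, together with the
# `soft_update` that usually accompanies it for DDPG target tracking (the
# soft variant is not called in these snippets and is included as context).
import torch

def hard_update(target, source):
    """Copy the source network's weights into the target verbatim."""
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.copy_(s_param)

def soft_update(target, source, tau):
    """Polyak averaging: target <- tau * source + (1 - tau) * target."""
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.mul_(1.0 - tau).add_(tau * s_param)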
def __init__(self, state_size, action_size, seed):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    random.seed(seed)
    self.seed = seed

    # create the local and target actor networks
    self.actor_local = ActorNetwork(state_size, action_size, seed).to(device)
    self.actor_target = ActorNetwork(state_size, action_size, seed).to(device)

    # create the local and target critic networks
    self.critic_local = CriticNetwork(state_size, action_size, seed).to(device)
    self.critic_target = CriticNetwork(state_size, action_size, seed).to(device)

    # optimizers for local actor and critic
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR, weight_decay=0.0)

    # MSE loss for updating the critic
    self.critic_loss_function = nn.MSELoss()

    # ensure that the local and target networks start from the same weights (a hard update)
    for target_param, param in zip(self.actor_target.parameters(), self.actor_local.parameters()):
        target_param.data.copy_(param.data)
    for target_param, param in zip(self.critic_target.parameters(), self.critic_local.parameters()):
        target_param.data.copy_(param.data)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0

    # init the noise class to sample from
    self.noise = GaussianNoise(self.action_size)
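# `ReplayBuffer` is constructed here (and in several other snippets) but never
# defined. A minimal uniform-sampling sketch, assuming the
# (action_size, buffer_size, batch_size, seed) signature used above; the field
# names and tensor conversion are assumptions, not the original implementation.
import random
from collections import deque, namedtuple

import numpy as np
import torch

class ReplayBuffer:
    """Fixed-size uniform replay buffer (sketch)."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience", ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Store one transition."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Sample a random batch of transitions as float tensors."""
        batch = random.sample(self.memory, k=self.batch_size)
        to_tensor = lambda xs: torch.from_numpy(np.vstack(xs)).float()
        states = to_tensor([e.state for e in batch])
        actions = to_tensor([e.action for e in batch])
        rewards = to_tensor([e.reward for e in batch])
        next_states = to_tensor([e.next_state for e in batch])
        dones = to_tensor([np.uint8(e.done) for e in batch])
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)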
def create_model(context):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    context.sess = tf.InteractiveSession(config=config)

    context.actor = ActorNetwork(
        context.sess,
        [len(context.assets), context.n, len(context.features)],
        len(context.assets) + 1,
        context.actor_learning_rate,
        context.tau,
        context.minibatch_size)

    context.critic = CriticNetwork(
        context.sess,
        [len(context.assets), context.n, len(context.features)],
        len(context.assets) + 1,
        context.critic_learning_rate,
        context.tau,
        context.gamma,
        context.actor.get_num_trainable_vars())

    # Initialize the TensorFlow variables
    context.sess.run(tf.global_variables_initializer())
    context.saver = tf.train.Saver()

    # Initialize the target networks' weights
    context.actor.update_target_network()
    context.critic.update_target_network()
def __init__(self, state_size, action_size, random_seed, learning_rate, noise, device):
    self.state_size = state_size
    self.action_size = action_size
    random.seed(random_seed)
    self.seed = random_seed
    self.learning_rate = learning_rate

    # Actor-only agent: local and target networks start from identical weights
    self.actor_local = ActorNetwork(state_size, action_size, random_seed).to(device)
    self.actor_target = ActorNetwork(state_size, action_size, random_seed).to(device)
    hard_update(self.actor_target, self.actor_local)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.learning_rate)

    self.noise = noise
    self.device = device
def __init__(self, state_size, action_size, memory, seed=None):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        memory: shared replay buffer
        seed (int): random seed (optional)
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = seed  # may be None, in which case the networks use unseeded defaults

    # create the local and target actor networks
    self.actor_local = ActorNetwork(state_size, action_size, seed).to(device)
    self.actor_target = ActorNetwork(state_size, action_size, seed).to(device)

    # create the local and target critic networks
    self.critic_local = CriticNetwork(state_size, action_size, seed).to(device)
    self.critic_target = CriticNetwork(state_size, action_size, seed).to(device)

    # optimizers for local actor and critic
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR, weight_decay=0.0)

    # Huber loss for updating the critic (less sensitive to outliers than MSE)
    self.critic_loss_function = nn.SmoothL1Loss()

    # copy the local networks' weights to the target networks
    self.copy_weights_from_local_to_target()

    # Replay memory
    self.memory = memory

    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0

    # init the noise class to sample from
    self.noise = GaussianNoise(self.action_size)
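# `GaussianNoise` (used here and in the earlier constructor) is not defined in
# these snippets. A minimal sketch assuming zero-mean i.i.d. noise; the sigma
# default and the sample() interface are assumptions.
import numpy as np

class GaussianNoise:
    """Zero-mean Gaussian exploration noise (sketch; sigma is an assumption)."""

    def __init__(self, size, sigma=0.1):
        self.size = size
        self.sigma = sigma

    def sample(self):
        """Return one noise vector to add to the actor's action."""
        return self.sigma * np.random.standard_normal(self.size)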
def __init__(self, state_size, action_size, num_agents,
             hidden_in_actor=512, hidden_out_actor=256, lr_actor=1e-4,
             hidden_in_critic=512, hidden_out_critic=256, lr_critic=3e-4,
             weight_decay_critic=0, seed=1, device='cpu'):
    super(DDPGAgent, self).__init__()
    self.device = device

    # Actor
    self.actor = ActorNetwork(state_size, hidden_in_actor, hidden_out_actor, action_size, seed).to(device)
    self.target_actor = ActorNetwork(state_size, hidden_in_actor, hidden_out_actor, action_size, seed).to(device)
    self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)

    # Critic
    self.critic = CriticNetwork(state_size, action_size, num_agents, hidden_in_critic, hidden_out_critic, seed).to(device)
    self.target_critic = CriticNetwork(state_size, action_size, num_agents, hidden_in_critic, hidden_out_critic, seed).to(device)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=weight_decay_critic)

    # Noise
    self.noise = OUNoise(action_size, seed, scale=1.0)

    # initialize targets same as original networks
    hard_update(self.target_actor, self.actor)
    hard_update(self.target_critic, self.critic)
def build_actor_critic(sess, env):
    with tf.variable_scope("model", reuse=None):
        with tf.name_scope("actor"):
            actor = ActorNetwork(sess, env, config, is_training=False)
        with tf.name_scope("critic"):
            critic = CriticNetwork(sess, env, config, is_training=False)

    sess.run(tf.global_variables_initializer())
    return actor, critic
def build_actor_critic(sess, env, env_eval):
    w_init = tflearn.initializations.xavier_initializer()
    with tf.variable_scope("model", reuse=None, initializer=w_init):
        with tf.name_scope("actor"):
            actor = ActorNetwork(sess, env, config, is_training=True)
        with tf.name_scope("critic"):
            critic = CriticNetwork(sess, env, config, is_training=True)

    # if config.noise_std:
    #     actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(env.input_cardin * env.state_cardin),
    #                                                sigma=config.noise_std,
    #                                                sigma_dec=config.noise_dec)
    # else:
    #     actor_noise = None

    # Evaluation networks share weights with the training networks (reuse=True)
    with tf.variable_scope("model", reuse=True):
        with tf.name_scope("actor"):
            actor_eval = ActorNetwork(sess, env_eval, config, is_training=False)
        with tf.name_scope("critic"):
            critic_eval = CriticNetwork(sess, env_eval, config, is_training=False)

    sess.run(tf.global_variables_initializer())

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    return actor, critic, actor_eval, critic_eval  # , actor_noise
def __init__(self, device, key, state_size, action_size, random_seed, memory, noise,
             lr, weight_decay, checkpoint_folder='./Saved_Model/'):
    self.DEVICE = device
    self.KEY = key

    self.state_size = state_size
    self.action_size = action_size
    random.seed(random_seed)
    self.seed = random_seed

    # Hyperparameters
    self.LR = lr
    self.WEIGHT_DECAY = weight_decay
    self.CHECKPOINT_FOLDER = checkpoint_folder

    # Actor Network (w/ Target Network)
    self.local = ActorNetwork(state_size, action_size, random_seed).to(self.DEVICE)
    self.target = ActorNetwork(state_size, action_size, random_seed).to(self.DEVICE)
    self.optimizer = optim.Adam(self.local.parameters(), lr=self.LR)

    # Resume from a checkpoint if one exists (both networks start from the same weights)
    self.checkpoint_full_name = self.CHECKPOINT_FOLDER + 'checkpoint_actor_' + str(self.KEY) + '.pth'
    if os.path.isfile(self.checkpoint_full_name):
        self.local.load_state_dict(torch.load(self.checkpoint_full_name))
        self.target.load_state_dict(torch.load(self.checkpoint_full_name))

    # Replay memory
    self.memory = memory

    # Noise process
    self.noise = noise
def build_actor_critic(sess, env):
    w_init = tflearn.initializations.xavier_initializer()
    with tf.variable_scope("model", reuse=None, initializer=w_init):
        with tf.name_scope("actor"):
            actor = ActorNetwork(sess, env, config, is_training=True)
        with tf.name_scope("critic"):
            critic = CriticNetwork(sess, env, config, is_training=True)

    sess.run(tf.global_variables_initializer())

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    return actor, critic
def __init__(self, opt, q_batch):
    self.opt = opt
    self.q_batch = q_batch
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    self.env = gym.make(self.opt.env)
    self.env.seed(self.opt.seed)
    self.n_state = self.env.observation_space.shape[0]
    self.n_act = self.env.action_space.n

    # Networks live in shared memory so worker processes can read them
    self.actor = ActorNetwork(self.n_state, self.n_act).to(self.device)
    self.critic = CriticNetwork(self.n_state).to(self.device)
    self.actor.share_memory()
    self.critic.share_memory()

    self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=opt.lr)
    self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=opt.lr)
def __init__(self, path_to_weights, weights_id):
    print("Init ANN")
    self.init = False

    # avoid TF allocating all GPU memory
    # https://stackoverflow.com/questions/34199233/how-to-prevent-tensorflow-from-allocating-the-totality-of-a-gpu-memory
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    K.set_session(sess)

    self.path_to_weights = path_to_weights
    self.next_weights_id = weights_id

    # check if the path is valid
    if not os.path.isdir(self.path_to_weights):
        print("Folder with weights does not exist!")
        print(self.path_to_weights)
        self.init = False
    else:
        self.init = True

    if self.init:
        # create the actor network
        self.actor = ActorNetwork.ActorNetwork(sess, 5, 2)

        # load the first weights
        weights_file = (str(self.path_to_weights) + "/"
                        + str(self.next_weights_id) + "/actormodel.h5")
        print("Loading first weights for ANN from " + weights_file)
        try:
            self.actor.model.load_weights(weights_file)
            self.next_weights_id = self.next_weights_id + 1
        except Exception:
            print("Cannot find the weight (.h5) file")
            print(weights_file)
            self.init = False
def __init__(self, opt, actor_id, q_trace, learner):
    self.opt = opt
    self.q_trace = q_trace
    self.learner = learner
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    self.env = gym.make(self.opt.env)
    self.env.seed(self.opt.seed + actor_id)
    self.n_state = self.env.observation_space.shape[0]
    self.n_act = self.env.action_space.n

    self.n_episodes = 0
    self.n_steps = 0
    self.gamma = opt.gamma

    # Per-actor epsilon, as in Ape-X: eps_i = 0.4 ** (1 + 7 * i / (N - 1))
    self.eps_greedy = 0.4 ** (1 + actor_id * 7 / (opt.n_actors - 1)) \
        if opt.n_actors > 1 else 0.4

    # Models
    self.actor = ActorNetwork(self.n_state, self.n_act).to(self.device)
    self.critic = CriticNetwork(self.n_state).to(self.device)
def __init__(self, opt, actor_id, q_trace, learner):
    self.opt = opt
    self.q_trace = q_trace
    self.learner = learner

    self.env = gym.make(self.opt.env)
    self.env_state = self.env.reset()
    self.n_state = self.env.observation_space.shape[0]
    self.n_act = self.env.action_space.n
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Parameters
    self.batch_size = opt.batch_size
    self.roll_out_n_steps = opt.roll_out_n_steps
    self.gamma = opt.gamma
    # same Ape-X epsilon schedule as above; see the example after this snippet
    self.eps_greedy = 0.4 ** (1 + actor_id * 7 / (opt.n_actors - 1)) \
        if opt.n_actors > 1 else 0.4
    self.n_episodes = 0

    self.actor = ActorNetwork(self.n_state, self.n_act).to(self.device)
    self.critic = CriticNetwork(self.n_state).to(self.device)
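# The per-actor epsilon line above follows the Ape-X schedule
# eps_i = 0.4 ** (1 + 7 * i / (N - 1)), which spreads exploration rates
# geometrically across actors. A quick demonstration of the values it
# produces; the actor count of 8 is an arbitrary example, not from the source.
n_actors = 8
eps = [0.4 ** (1 + actor_id * 7 / (n_actors - 1)) for actor_id in range(n_actors)]
print([round(e, 4) for e in eps])
# [0.4, 0.16, 0.064, 0.0256, 0.0102, 0.0041, 0.0016, 0.0007]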
def main(args):
    with tf.Session() as sess:
        env = gym.make(args['env'])
        np.random.seed(int(args['random_seed']))
        tf.set_random_seed(int(args['random_seed']))
        env.seed(int(args['random_seed']))

        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        action_bound = env.action_space.high
        # Ensure the action bound is symmetric
        # assert (env.action_space.high == -env.action_space.low)

        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             float(args['actor_lr']), float(args['tau']),
                             int(args['minibatch_size']))
        critic = CriticNetwork(sess, state_dim, action_dim,
                               float(args['critic_lr']), float(args['tau']),
                               float(args['gamma']),
                               actor.get_num_trainable_vars())
        actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim))

        if args['train']:
            if not os.path.exists(args['save_dir']):
                os.makedirs(args['save_dir'])
            with open(os.path.join(args['save_dir'], 'config.json'), 'w') as f:
                json.dump(args, f, indent=2)
            train(sess, env, args, actor, critic, actor_noise)
        else:
            # Evaluate a saved checkpoint on a batch of sampled tasks
            indexes = [399]
            num_test_tasks = 100
            buckets = 1
            successes = []
            directory = args['to_pickle']

            for index in indexes:
                task_success = []
                saver = tf.train.Saver()
                saver.restore(
                    sess,
                    "../final_models/multitask/fixed/{0}/model-{1}.ckpt".format(directory, index))
                for _ in range(buckets):
                    tasks = env.unwrapped.sample_tasks(num_test_tasks)
                    success = 0
                    for task in tasks:
                        s = env.reset_task(task)
                        step = 0
                        d = False
                        while not d:
                            action = actor.predict_target(np.reshape(s, (1, actor.s_dim)))[0]
                            step += 1
                            s, r, d, _ = env.step(action)
                            if r == 1:
                                success += 1
                    env.close()
                    task_success.append(success / num_test_tasks)
                successes.append(task_success)

            env.close()
            if not os.path.exists('./pkls'):
                os.makedirs('./pkls')
            with open('./pkls/{0}.pkl'.format(args['save_dir']), 'wb') as f:
                pickle.dump(successes, f)
import torch
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
from torch.autograd import Variable
import gym
import numpy as np
import matplotlib.pyplot as plt

from model import ActorNetwork, CriticNetwork

actor = ActorNetwork(4, 2)
critic = CriticNetwork(4)
actor_optimizer = optim.Adam(actor.parameters(), lr=1e-4)
critic_optimizer = optim.Adam(critic.parameters(), lr=8e-4)

env = gym.make('CartPole-v0')

GAMMA = 0.99
N_EPISODES = 20000
LOG_STEPS = 100
SAVE_STEPS = 100


def select_action(S):
    '''
    select action based on current state

    args:
        S: current state
    returns:
        action to take, log probability of the chosen action
    '''
    # body reconstructed from the docstring; assumes the actor ends in a
    # softmax and returns action probabilities
    probs = actor(torch.from_numpy(S).float().unsqueeze(0))
    dist = Categorical(probs)
    action = dist.sample()
    return action.item(), dist.log_prob(action)
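# `model.ActorNetwork` and `model.CriticNetwork` are imported above but not
# shown. A minimal sketch of what model.py might contain, consistent with the
# ActorNetwork(4, 2) / CriticNetwork(4) calls and the softmax output assumed
# in select_action; the hidden width (64) is an assumption.
import torch.nn as nn
import torch.nn.functional as F

class ActorNetwork(nn.Module):
    """Policy network: state -> action probabilities (sketch)."""

    def __init__(self, state_size, action_size, hidden=64):
        super().__init__()
        self.fc1 = nn.Linear(state_size, hidden)
        self.fc2 = nn.Linear(hidden, action_size)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        return F.softmax(self.fc2(x), dim=-1)

class CriticNetwork(nn.Module):
    """State-value network: state -> V(s) (sketch)."""

    def __init__(self, state_size, hidden=64):
        super().__init__()
        self.fc1 = nn.Linear(state_size, hidden)
        self.fc2 = nn.Linear(hidden, 1)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        return self.fc2(x)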