def __init__(self, state_size=24, action_size=2, random_seed=0):
    """Initialize an Agent object.

    Params:
        state_size: dimension of each state.
        action_size: number of actions.
        random_seed: seed for the random number generators.
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(random_seed)

    # Actor network
    self.actor_local = Actor(self.state_size, self.action_size, random_seed).to(device)
    self.actor_target = Actor(self.state_size, self.action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic network
    self.critic_local = Critic(self.state_size, self.action_size, random_seed).to(device)
    self.critic_target = Critic(self.state_size, self.action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC)

    # Noise process (Ornstein-Uhlenbeck)
    self.noise = OUNoise(action_size, random_seed)

    # Replay memory (experience replay buffer)
    self.memory = ReplayBuffer(self.action_size, BUFFER_SIZE, MINI_BATCH, random_seed)
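# The constructor above (and several below) assumes a ReplayBuffer class that is not
# part of this snippet. The following is a minimal sketch of the interface implied by the
# ReplayBuffer(action_size, BUFFER_SIZE, MINI_BATCH, random_seed) call; names and details
# are illustrative, not the original implementation.
import random
from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class ReplayBuffer:
    """Fixed-size buffer that stores experience tuples and samples random minibatches."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience", ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Store a single transition."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Return a random minibatch as torch tensors on the target device."""
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)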
def __init__(self, task, actor_params={}, critic_params={}, noise_params={},
             replay_memory_params={}, algo_params={}):
    # Default params
    default_actor_params = {'lr': .001}
    default_critic_params = {'lr': .001}
    default_noise_params = {'mu': 0, 'theta': .15, 'sigma': .2}
    default_replay_memory_params = {'buffer_size': 100000, 'batch_size': 64}
    default_algo_params = {'gamma': .99, 'tau': .1}

    # Final params: user-supplied values override the defaults
    final_actor_params = {**default_actor_params, **actor_params}
    final_critic_params = {**default_critic_params, **critic_params}
    final_noise_params = {**default_noise_params, **noise_params}
    final_replay_memory_params = {**default_replay_memory_params, **replay_memory_params}
    final_algo_params = {**default_algo_params, **algo_params}

    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size,
                             self.action_low, self.action_high, final_actor_params)
    self.actor_target = Actor(self.state_size, self.action_size,
                              self.action_low, self.action_high, final_actor_params)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size, final_critic_params)
    self.critic_target = Critic(self.state_size, self.action_size, final_critic_params)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.noise = OUNoise(self.action_size,
                         final_noise_params['mu'],
                         final_noise_params['theta'],
                         final_noise_params['sigma'])

    # Replay memory
    self.batch_size = final_replay_memory_params['batch_size']
    self.memory = ReplayBuffer(final_replay_memory_params['buffer_size'],
                               final_replay_memory_params['batch_size'])

    # Algorithm parameters
    self.gamma = final_algo_params['gamma']  # discount factor
    self.tau = final_algo_params['tau']      # for soft update of target parameters
def begin_play(self):
    self.actor = self.uobject.get_owner()
    self.VehicleMovement = self.actor.VehicleMovement
    self.replay_buffer = ReplayBuffer(max_size=50000)
    ue.log('Begin Play on TorchActor class')
    ue.log(torch.cuda.is_available())
    ue.log(dir(self.uobject))
    self.policy = TD3(lr, state_dim, action_dim, max_action)
    self.gen_target()
    self.last_state = []
    self.last_reward = 0
    self.last_action = None
    self.last_done = False
    self.frame = int(random.random() * 100)
    self.start_pos = self.uobject.get_actor_location()
    # self.actor.AutoDrive = True
    self.policy.load(directory, loadfilename)
    self.filename = "{}_{}".format(filename, self.frame)
    self.episode = 0
    self.writer = SummaryWriter(os.path.join(directory, filename))
    self.ep_frame = 0
    self.ep_reward = 0
    self.ep_reward_avg_BEST = 0
def __init__(self, config, file_prefix=None):
    self.buffer_size = config.hyperparameters.buffer_size
    self.batch_size = config.hyperparameters.batch_size
    self.update_frequency = config.hyperparameters.update_frequency
    self.gamma = config.hyperparameters.gamma
    self.number_of_agents = config.environment.number_of_agents
    self.noise_weight = config.hyperparameters.noise_start
    self.noise_decay = config.hyperparameters.noise_decay
    self.memory = ReplayBuffer(config)
    self.t = 0
    self.agents = [DDPGAgent(index, config) for index in range(self.number_of_agents)]

    if file_prefix:
        # Load saved actor/critic weights into both local and target networks
        for i, to_load in enumerate(self.agents):
            actor_file = torch.load(
                f"{os.getcwd()}/models/by_score/{file_prefix}_actor_{i}.weights",
                map_location='cpu')
            critic_file = torch.load(
                f"{os.getcwd()}/models/by_score/{file_prefix}_critic_{i}.weights",
                map_location='cpu')
            to_load.actor_local.load_state_dict(actor_file)
            to_load.actor_target.load_state_dict(actor_file)
            to_load.critic_local.load_state_dict(critic_file)
            to_load.critic_target.load_state_dict(critic_file)
        print(f'Files loaded with prefix {file_prefix}')
def init_td3(self):
    self.policy = TD3(
        self.state_dim,
        self.action_dim,
        self.cnf.td3.max_action,
    )
    self.buffer = ReplayBuffer(self.state_dim, self.action_dim)
class Agent():
    # needs functions: init, choose_action, store_transition
    def __init__(self, input_dims, fc1_dims, fc2_dims, n_actions, alpha, beta,
                 batch_size=100, max_size=1e6, mu=0, sigma=0.1, clip=0.5):
        self.input_dims = input_dims
        self.n_actions = n_actions
        self.alpha = alpha
        self.beta = beta
        self.clip = clip
        self.batch_size = batch_size
        self.max_size = int(max_size)
        self.mu = mu        # mean of the Gaussian exploration noise
        self.sigma = sigma  # std of the Gaussian exploration noise
        self.actor = ActorNetwork(alpha, input_dims, fc1_dims, fc2_dims,
                                  n_actions, 'actor_net')
        self.critic = CriticNetwork(beta, input_dims, fc1_dims, fc2_dims,
                                    n_actions, 'critic_net')
        self.target_critic = CriticNetwork(beta, input_dims, fc1_dims, fc2_dims,
                                           n_actions, 'target_critic')
        self.memory = ReplayBuffer(self.max_size, input_dims, n_actions,
                                   batch_size=self.batch_size)

    def choose_action(self, observation):
        self.actor.eval()
        state = T.tensor([observation], dtype=T.float).to(self.actor.device)
        mu = self.actor.forward(state).to(self.actor.device)
        # add Gaussian exploration noise, then clamp the action to [-clip, clip]
        noise = T.tensor(gauss(self.mu, self.sigma), dtype=T.float).to(self.actor.device)
        mu_prime = T.clamp(mu + noise, -self.clip, self.clip)
        self.actor.train()
        return mu_prime.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)
def __init__(self, state_dim, n_actions, gamma=0.99, lmbda=1.0, eps=1e-3,
             itr_target_update=1e1, device="cuda"):
    """Train a Q-net (using the double-Q trick) on (state, action, reward, state, action)
    pairs. This is thus an 'on-policy' Q-net.

    Args:
        state_dim (tuple): Shape of the observation input.
        n_actions (int): Size of our discrete action space.
        gamma (float, optional): Discount factor. Defaults to 0.99.
        lmbda (float, optional): Random Network Distillation weight in A_strat. Defaults to 1.0.
        eps (float, optional): Minimum value for A_strat, to avoid potential divide-by-zero
            errors when training with A_strat weights. Defaults to 1e-3.
        itr_target_update (int, optional): Number of SARSA updates after which we update
            the target network. Defaults to 1e1.
        device (str, optional): Whether we use GPU or CPU. Defaults to "cuda".
    """
    self.q_net = Net(state_dim, n_actions).to(device)
    self.q_net_opt = optim.Adam(self.q_net.parameters(), lr=0.001)
    self.target_q_net = Net(state_dim, n_actions).to(device)
    self.itr_target_update = itr_target_update
    self.lmbda = lmbda
    self.count = 0
    self.gamma = gamma
    self.eps = eps
    self.device = device
    self.loss_func = nn.MSELoss()
    self.memory = ReplayBuffer(1e4, 64)
def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 0.01    # for soft update of target parameters
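# The OUNoise class used above is not included in this snippet. Below is a minimal sketch
# of a standard Ornstein-Uhlenbeck noise process matching the OUNoise(size, mu, theta, sigma)
# call; treat the internals as an assumption rather than the original implementation.
import copy
import numpy as np

class OUNoise:
    """Temporally correlated exploration noise for continuous-action agents."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update the internal state and return it as a noise sample."""
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state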
def __init__(self, action_size, discount_factor=0.95, tau=0.02):
    super(MADDPG, self).__init__()
    # Create the multi-agent system as a list of DDPG agents
    self.maddpg_agents = [AgentDDPG(24, 2, 0), AgentDDPG(24, 2, 0)]
    self.discount_factor = discount_factor
    self.tau = tau
    self.iter = 0
    self.count = 0
    self.update_every = 1
    self.batch_size = 128
    self.agent_number = len(self.maddpg_agents)
    self.t_step = 0

    # Initialize the replay memory
    self.buffer_size = 1000000
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
    self.action_size = action_size
    self.total_reward = np.zeros((1, 2))

    # Initialize the Ornstein-Uhlenbeck noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)
def __init__(self, params: Parameters):
    self.parms = params
    self.env = Env(params.game, params.gamma, norm_rewards=None, norm_states=False)
    self.buffer = ReplayBuffer(params.replay_size)

    # Seed
    self.env.seed(params.seed)
    np.random.seed(params.seed)
    tf.random.set_seed(params.seed)

    self.critic = DDPGValueNet(feature_shape=self.env.features_shape,
                               a_num=self.env.num_actions, lr=params.lr_c)
    self.target_critic = DDPGValueNet(feature_shape=self.env.features_shape,
                                      a_num=self.env.num_actions, lr=params.lr_c)
    self._copy_para(self.critic.model, self.target_critic.model)

    self.actor = CtsPolicy(action_bound=self.env.action_bound,
                           action_dim=self.env.num_actions, lr=params.lr_a)
    self.target_actor = CtsPolicy(action_bound=self.env.action_bound,
                                  action_dim=self.env.num_actions, lr=params.lr_a)
    self._copy_para(self.actor, self.target_actor)

    self.ema = tf.train.ExponentialMovingAverage(decay=1.0 - self.parms.tau)
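# The _copy_para helper called above is not shown. A minimal sketch, assuming the networks
# are tf.keras models and the intent is a hard copy of weights into the target network;
# the name and signature follow the calls above, the body is illustrative.
def _copy_para(self, from_model, to_model):
    """Hard-copy trainable weights from the source network into the target network."""
    for src, dst in zip(from_model.trainable_weights, to_model.trainable_weights):
        dst.assign(src)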
def __init__(self, k_level, H, state_dim, action_dim, render, threshold,
             action_bounds, action_offset, state_bounds, state_offset, lr):
    # adding lowest level
    self.HAC = [DDPG(state_dim, action_dim, action_bounds, action_offset, lr, H)]
    self.replay_buffer = [ReplayBuffer()]

    # adding remaining levels
    for _ in range(k_level - 1):
        self.HAC.append(DDPG(state_dim, state_dim, state_bounds, state_offset, lr, H))
        self.replay_buffer.append(ReplayBuffer())

    # set some parameters
    self.k_level = k_level
    self.H = H
    self.action_dim = action_dim
    self.state_dim = state_dim
    self.threshold = threshold
    self.render = render

    # logging parameters
    self.goals = [None] * self.k_level
    self.reward = 0
    self.timestep = 0
def __init__(self, params: Parameters, obs_shapes, a_shapes, a_bounds, a_shape_index):
    self.index = a_shape_index
    self.parms = params
    self.buffer = ReplayBuffer(params.replay_size)

    # Critic
    self.critic = DDPGValueNet(feature_shapes=obs_shapes, a_shapes=a_shapes,
                               lr=params.lr_c, n_agent=params.n_agent)
    self.target_critic = DDPGValueNet(feature_shapes=obs_shapes, a_shapes=a_shapes,
                                      lr=params.lr_c, n_agent=params.n_agent)
    self._copy_para(self.critic.model, self.target_critic.model)

    # Actor
    self.actor = CtsPolicy(action_bound=a_bounds, s_shape=obs_shapes[a_shape_index],
                           a_shape=a_shapes[a_shape_index], lr=params.lr_a)
    self.target_actor = CtsPolicy(action_bound=a_bounds, s_shape=obs_shapes[a_shape_index],
                                  a_shape=a_shapes[a_shape_index], lr=params.lr_a)
    self._copy_para(self.actor, self.target_actor)
def __init__(self, env_id, alpha, beta, input_dims, tau, env, gamma=0.99,
             update_actor_interval=2, warmup=1000, n_actions=2, max_size=1000000,
             layer1_size=256, layer2_size=256, batch_size=256, noise=0.1):
    self.gamma = gamma
    self.tau = tau
    self.max_action = env.action_space.high
    self.min_action = env.action_space.low
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.learn_step_cntr = 0
    self.time_step = 0
    self.warmup = warmup
    self.n_actions = n_actions
    self.update_actor_iter = update_actor_interval

    self.actor = ActorNetwork(alpha, input_dims, layer1_size, layer2_size,
                              n_actions=n_actions, name=env_id+'_actor')
    self.critic_1 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                  n_actions=n_actions, name=env_id+'_critic_1')
    self.critic_2 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                  n_actions=n_actions, name=env_id+'_critic_2')
    self.target_actor = ActorNetwork(alpha, input_dims, layer1_size, layer2_size,
                                     n_actions=n_actions, name=env_id+'_target_actor')
    self.target_critic_1 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                         n_actions=n_actions, name=env_id+'_target_critic_1')
    self.target_critic_2 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                         n_actions=n_actions, name=env_id+'_target_critic_2')

    self.noise = noise
    self.update_network_parameters(tau=1)
def begin_play(self):
    self.actor = self.uobject.get_owner()
    self.VehicleMovement = self.actor.VehicleMovement
    self.replay_buffer = ReplayBuffer(max_size=50000)
    ue.log('Begin Play on TorchActor class')
    ue.log(torch.cuda.is_available())
    self.policy = TD3(lr, state_dim, action_dim, max_action)
    self.gen_target()
    self.last_state = []
    self.last_reward = 0
    self.last_action = None
    self.last_done = False
    self.frame = int(random.random() * 100)
    self.start_pos = self.uobject.get_actor_location()
    self.policy.load(directory, loadfilename)
    self.episode = 0
    self.ep_frame = 0
    self.ep_reward = 0
    # the locally created policy is replaced by the shared master policy
    self.policy = master.policy
    self.boredom = 0.8
    print("MASTER")
    print(master)
    self.my_id = master.get_id()
    self.actor.TextRender.call('SetText {}'.format(self.my_id))
def __init__(self, state_size, action_size, policy_network, value_network,
             n_agents, device, use_gae=True):
    self.state_size = state_size
    self.action_size = action_size
    self.n_agents = n_agents
    self.device = device

    self.policy_network = policy_network(state_size=state_size,
                                         action_size=action_size).to(device)
    self.policy_optimizer = optim.Adam(self.policy_network.parameters(), lr=LR)

    self.value_network = value_network(state_size=state_size, action_size=1).to(device)
    self.value_optimizer = optim.Adam(self.value_network.parameters(), lr=LR)

    self.epsilon = EPSILON
    self.beta = BETA
    self.reset_memory()
    self.buffer = ReplayBuffer(int(128), 64)
    self.use_gae = use_gae
def __init__(self, alpha, beta, input_dims, tau, env, gamma=0.99,
             update_actor_interval=2, n_actions=2, warmup=1000, max_size=1e6,
             layer1_size=400, layer2_size=300, batch_size=100, noise=0.1):
    self.gamma = gamma
    self.tau = tau
    self.max_action = env.action_space.high
    self.min_action = env.action_space.low
    # self.max_action = n_actions
    # self.min_action = 0
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.learn_step_cntr = 0  # how often to call the learning function on the actor network
    self.time_step = 0        # handles countdown to end of warmup
    self.warmup = warmup
    self.n_actions = n_actions
    self.update_actor_iter = update_actor_interval

    self.actor = ActorNetwork(alpha, input_dims, layer1_size, layer2_size,
                              n_actions, 'actor_net')
    self.critic_1 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                  n_actions, 'critic_1')
    self.critic_2 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                  n_actions, 'critic_2')
    self.target_actor = ActorNetwork(alpha, input_dims, layer1_size, layer2_size,
                                     n_actions, name='target_actor')
    self.target_critic_1 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                         n_actions, name='target_critic_1')
    self.target_critic_2 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                         n_actions, name='target_critic_2')

    self.noise = noise
    self.update_network_parameters(tau=1)  # sets the target network parameters to the originals
class DDPG:
    def __init__(self, action_dim, action_bound, tau, lr_a, lr_c, state_dim,
                 gamma, batch_size):
        self.target = tf.placeholder(tf.float32, [None, 1], 'critic_target')
        self.s = tf.placeholder(tf.float32, [None, state_dim], 'state')
        self.s_ = tf.placeholder(tf.float32, [None, state_dim], 'next_state')
        self.memory = ReplayBuffer(max_size=10000)
        self.noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim))
        self.batch_size = batch_size
        self.gamma = gamma
        self.sess = tf.Session()
        self.actor = Actor(self.sess, self.s, self.s_, action_dim, action_bound,
                           tau, lr_a, f1_units=300)
        self.critic = Critic(self.sess, lr_c, self.s, self.s_, self.actor.a,
                             self.actor.a_, self.target, tau, gamma, state_dim,
                             action_dim, f1_units=300)
        self.actor.add_grad_to_graph(self.critic.a_g)
        self.sess.run(tf.global_variables_initializer())

    def choose_action(self, s):
        a = self.actor.choose_action(s)
        var = self.noise()
        a = a + var
        return a[0]

    def update_target_networks(self):
        self.sess.run([self.actor.replace, self.critic.replace])

    def store(self, s, a, r, s_, done):
        self.memory.store(s, a, r, s_, done)

    def learn(self):
        bs, ba, br, bs_, _ = self.memory.sample(self.batch_size)
        q_ = self.sess.run(self.critic.q_, {self.s_: bs_})
        br = br[:, np.newaxis]
        target_critic = br + self.gamma * q_
        self.critic.learn(bs, ba, target_critic)
        self.actor.learn(bs)
        self.update_target_networks()
def __init__(self, state_manager: StateManager, actor: Actor):
    self.state_manager = state_manager
    # pool of active nodes
    self.node_pool = None
    # ANET
    self.actor = actor
    # buffer
    self.replay_buffer = ReplayBuffer(self.state_manager.tree_distribution_converter)
def __init__(self, state_size, action_size, num_agents, random_seed=0):
    in_critic = num_agents * state_size
    self.agents = [
        DDPG_agent(state_size, in_critic, action_size, num_agents, random_seed)
        for i in range(num_agents)
    ]
    self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, random_seed)
    self.num_agents = num_agents
def __init__(self, env, use_cnn=False, learning_rate=3e-4, gamma=0.99, buffer_size=10000):
    self.env = env
    self.learning_rate = learning_rate
    self.gamma = gamma
    self.replay_buffer = ReplayBuffer(buffer_size)
    self.dqn = (CnnDQN(env.observation_space.shape, env.action_space.n)
                if use_cnn
                else DQN(env.observation_space.shape[0], env.action_space.n))
    self.dqn_optimizer = torch.optim.Adam(self.dqn.parameters())
    self.dqn_loss = torch.nn.MSELoss()
def __init__(self, env, network, learning_rate, gamma, eps_max, eps_min,
             eps_dec, buffer_size, replace_cnt):
    super().__init__(env, network, learning_rate, gamma, eps_max, eps_min, eps_dec)
    self.replay_buffer = ReplayBuffer(max_size=buffer_size, input_shape=env.env_shape)
    self.learn_step_counter = 0
    self.replace_cnt = replace_cnt
    self.q_eval = ConvDQN(env.env_shape, env.no_of_actions)
    self.q_target = ConvDQN(env.env_shape, env.no_of_actions)
def __init__(self, env, network, learning_rate, gamma, eps_max, eps_min,
             eps_dec, buffer_size):
    super().__init__(env, network, learning_rate, gamma, eps_max, eps_min, eps_dec)
    if self.network == "SimpleConvDQN":
        self.model = ConvDQN(env.env_shape, env.no_of_actions)
    elif self.network == "LinearDQN":
        self.model = LinearDQN(env.env_shape, env.no_of_actions)
    self.replay_buffer = ReplayBuffer(max_size=buffer_size, input_shape=env.env_shape)
def __init__(self, obs_dim, action_dim, *args, **kwargs):
    # Initialize arguments
    hidden_dims_actor = tuple(kwargs.get("hidden_dims_actor", (256, 256)))
    hidden_dims_critic = tuple(kwargs.get("hidden_dims_critic", (256, 256)))
    hidden_dims_model = tuple(kwargs.get("hidden_dims_model", (256, 256)))
    self.gamma = 0.99
    self.tau = 0.005
    self.delay = 2
    lr_actor = 0.001
    lr_critic = 0.001
    lr_model = 0.0001
    self.step_random = 500  # How many random actions to take before using actor for action selection
    self.update_every_n_steps = 51  # How often to update model, actor and critics
    self.update_steps = 200  # How many gradient updates to perform, per model, when updating
    self.time = time.time()

    # Initialize actor
    self.actor = Actor(obs_dim, hidden_dims_actor, action_dim)
    self.actor_target = copy.deepcopy(self.actor)
    self.optimizer_actor = torch.optim.Adam(self.actor.parameters(), lr=lr_actor)
    for par in self.actor_target.parameters():
        par.requires_grad = False

    # Initialize 2 critics
    self.critics = []
    self.critics_target = []
    self.optimizer_critics = []
    for k in range(2):
        critic = Critic(obs_dim + action_dim, hidden_dims_critic)
        self.critics.append(critic)
        self.critics_target.append(copy.deepcopy(critic))
        self.optimizer_critics.append(torch.optim.Adam(critic.parameters(), lr=lr_critic))
        for par in self.critics_target[k].parameters():
            par.requires_grad = False

    # Initialize models
    self.models = []
    self.optimizer_models = []
    for k in range(25):
        model = Model(obs_dim + action_dim, hidden_dims_model, obs_dim)
        self.models.append(model)
        self.optimizer_models.append(torch.optim.Adam(model.parameters(), lr=lr_model))

    # Setup Replay Buffer
    self.buffer = ReplayBuffer()
    self.o_old = None
    self.a_old = None
    self.step_i = 0
class TorchDriverMaster:
    tester = "hello"

    # this is called on game start
    def begin_play(self):
        global master
        master = self
        self.replay_buffer = ReplayBuffer(max_size=50000)
        ue.log('Begin Play on TorchActor class')
        ue.log("Has CUDA: {}".format(torch.cuda.is_available()))
        self.policy = TD3(lr, state_dim, action_dim, max_action)
        self.frame = 0
        self.policy.load(directory, loadfilename)
        self.episode = 0
        self.worker_id = 0
        self.writer = SummaryWriter(os.path.join(directory, filename))

    def get_next_ep(self):
        self.episode += 1
        return self.episode

    def get_id(self):
        retid = self.worker_id
        self.worker_id += 1
        return retid

    def write_data(self, ep_reward, ep_reward_avg):
        real_ep = self.episode
        self.writer.add_scalar('ep_reward', ep_reward, real_ep)
        self.writer.add_scalar('ep_avg_reward', ep_reward_avg, real_ep)
        print("finished ep {}, avgscore: {}".format(real_ep, ep_reward_avg))
        self.episode += 1

    def transfer_buffer(self, buffer):
        self.replay_buffer.mergein(buffer)
        print("buffer merged, length: {}".format(self.replay_buffer.size))

    def tick(self, delta_time):
        self.frame += 1
        if self.replay_buffer.size:
            al, c1l, c2l, prl = self.policy.update(self.replay_buffer, 1, batch_size,
                                                   gamma, polyak, policy_noise,
                                                   noise_clip, policy_delay)
            if self.frame % 60 == 0:
                print("aloss:{}".format(al))
        if self.frame % 600 == 0:
            self.policy.save(directory, filename)
def __init__(self, multi_step_env: MultiStepEnv = None, gamma: float = None,
             eps_max: float = None, eps_min: float = None, eps_decay_steps: int = None,
             replay_min_size: int = None, replay_max_size: int = None,
             target_update_freq: int = None, steps_per_update: int = None,
             train_batch_size: int = None, enable_rgb: bool = None,
             model_save_file: str = None, optim_l2_reg_coeff: float = None,
             optim_lr: float = None, eval_freq: int = None):
    self.env = multi_step_env
    self.gamma = gamma
    self.eps_max = eps_max
    self.eps_min = eps_min
    self.eps_decay_steps = eps_decay_steps
    self.replay_min_size = replay_min_size
    self.target_update_freq = target_update_freq
    self.train_batch_size = train_batch_size
    self.steps_per_update = steps_per_update
    self.model_save_file = model_save_file
    self.optim_lr = optim_lr
    self.optim_l2_reg_coeff = optim_l2_reg_coeff
    self.eval_freq = eval_freq
    self.replay_memory = ReplayBuffer(capacity=replay_max_size)
    self.n_steps = 0

    if enable_rgb:
        self.q_train = Q(self.env.frame_stack_size * 3, self.env.height,
                         self.env.width, self.env.num_actions).to(settings.device)
        self.q_target = Q(self.env.frame_stack_size * 3, self.env.height,
                          self.env.width, self.env.num_actions).to(settings.device)
    else:
        self.q_train = Q(self.env.frame_stack_size, self.env.height,
                         self.env.width, self.env.num_actions).to(settings.device)
        self.q_target = Q(self.env.frame_stack_size, self.env.height,
                          self.env.width, self.env.num_actions).to(settings.device)

    self.optimizer = Adam(self.q_train.parameters(), eps=1e-7, lr=self.optim_lr,
                          weight_decay=self.optim_l2_reg_coeff)
    # self.mse_loss = nn.MSELoss()
    assert self.q_train.state_dict().keys() == self.q_target.state_dict().keys()
def __init__(self, env, save_dirs, save_freq=10000, gamma=0.99, batch_size=32,
             learning_rate=0.0001, buffer_size=10000, learn_start=10000,
             target_network_update_freq=1000, train_freq=4, epsilon_min=0.01,
             exploration_fraction=0.1, tot_steps=int(1e7)):
    DDQN.__init__(self, env=env, save_dirs=save_dirs, learning_rate=learning_rate)

    self.gamma = gamma
    self.batch_size = batch_size
    self.learning_rate = learning_rate
    self.buffer_size = buffer_size
    self.learn_start = learn_start
    self.target_network_update_freq = target_network_update_freq
    self.train_freq = train_freq
    self.epsilon_min = epsilon_min
    self.exploration_fraction = exploration_fraction
    self.tot_steps = tot_steps
    self.epsilon = 1.0
    self.exploration = LinearSchedule(
        schedule_timesteps=int(self.exploration_fraction * self.tot_steps),
        initial_p=self.epsilon,
        final_p=self.epsilon_min)

    self.save_freq = save_freq
    self.replay_buffer = ReplayBuffer(save_dirs=save_dirs,
                                      buffer_size=self.buffer_size,
                                      obs_shape=self.input_shape)
    self.exploration_factor_save_path = os.path.join(self.save_path,
                                                     'exploration-factor.npz')
    self.target_model_save_path = os.path.join(self.save_path, 'target-wts.h5')
    self.target_model = NeuralNet(input_shape=self.input_shape,
                                  num_actions=self.num_actions,
                                  learning_rate=learning_rate,
                                  blueprint=self.blueprint).model

    self.show_hyperparams()
    self.update_target()
    self.load()
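# The LinearSchedule used above is imported elsewhere in the codebase. A minimal sketch
# in the style of the OpenAI Baselines schedule, matching the keyword arguments used above;
# the body is an assumption about its behavior, not the original source.
class LinearSchedule:
    """Linearly anneal epsilon from initial_p to final_p over schedule_timesteps steps,
    then hold it at final_p."""

    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        """Return the interpolated value at timestep t."""
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)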
def begin_play(self):
    self.actor = self.uobject.get_owner()
    self.replay_buffer = ReplayBuffer(max_size=50000)
    ue.log('Begin Play on TorchWalkerMinion class')
    # self.policy = TD3(lr, state_dim, action_dim, max_action)
    self.gen_target()
    self.last_state = []
    self.last_reward = 0
    self.last_action = None
    self.last_done = False
    self.frame = int(random.random() * 100)
    self.start_pos = self.uobject.get_actor_location()
    self.episode = 0
    self.ep_frame = 0
    self.ep_reward = 0
    self.total_frame = 0
    self.boredom = 0.8
    print("MASTER")
    print(master)
    actionlen = self.actor.get_action_dim()
    TEMP_OBS = self.actor.update_observation()[0]
    print("TEMP_OBS")
    print(TEMP_OBS)
    obslen = len(TEMP_OBS)
    print(obslen)
    master.init_network(obslen + 1, actionlen)
    self.my_id = master.get_id()
    # self.actor.TextRender.call('SetText {}'.format(self.my_id))
    self.random_frames = 10
    self.bg_thread = None
    self.exploration_noise = random.random() * 0.3
    self.first_frame = True
    self.policy = master.policy
    self.action_space_low = [-1 for x in range(master.action_dim)]
    self.action_space_high = [1 for x in range(master.action_dim)]
    self.obs_space_low = [-1 for x in range(master.state_dim)]
    self.obs_space_high = [1 for x in range(master.state_dim)]
def __init__(self, state_size: int, action_size: int, num_agents: int,
             epsilon, random_seed: int):
    """
    Initialize a DDPG Agent object.

    :param state_size: dimension of state (input)
    :param action_size: dimension of action (output)
    :param num_agents: number of concurrent agents in the environment
    :param epsilon: initial value of epsilon for exploration
    :param random_seed: random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.num_agents = num_agents
    self.seed = random.seed(random_seed)
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.t_step = 0

    # Hyperparameters
    self.buffer_size = 1000000
    self.batch_size = 128
    self.update_every = 10
    self.num_updates = 10
    self.gamma = 0.99
    self.tau = 0.001
    self.lr_actor = 0.0001
    self.lr_critic = 0.001
    self.weight_decay = 0
    self.epsilon = epsilon
    self.epsilon_decay = 0.97
    self.epsilon_min = 0.005

    # Networks (Actor: State -> Action, Critic: (State, Action) -> Value)
    self.actor_local = Actor(self.state_size, self.action_size, random_seed).to(self.device)
    self.actor_target = Actor(self.state_size, self.action_size, random_seed).to(self.device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor)
    self.critic_local = Critic(self.state_size, self.action_size, random_seed).to(self.device)
    self.critic_target = Critic(self.state_size, self.action_size, random_seed).to(self.device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic,
                                       weight_decay=self.weight_decay)

    # Initialize actor and critic networks to start with same parameters
    self.soft_update(self.actor_local, self.actor_target, tau=1)
    self.soft_update(self.critic_local, self.critic_target, tau=1)

    # Noise Setup
    self.noise = OUNoise(self.action_size, random_seed)

    # Replay Buffer Setup
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
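# The soft_update method called above (with tau=1 as a hard copy) is not shown in this
# snippet. A minimal sketch of the standard DDPG polyak-averaging update it presumably
# implements; treat the body as an assumption.
def soft_update(self, local_model, target_model, tau):
    """Blend target weights toward local weights:
    theta_target = tau * theta_local + (1 - tau) * theta_target.
    With tau=1 this copies the local network into the target network."""
    for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)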
def __init__(self, state_size, action_size, cfg, num_agents=1, agent_id=0):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        cfg (config object): main configuration with other passed settings
        num_agents (int): optional (default: 1). If >1, will multiply state and action
            space sizes for the critic. Used with MADDPG.
        agent_id (int): optional (default: 0). Sets the agent id for MADDPG.
    """
    print("Initializing single DDPG agent!")
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(cfg.random_seed)
    self.n_agents = num_agents
    self.agent_id = agent_id
    self.cfg = cfg

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, cfg.random_seed,
                             cfg.dense_layers_actor).to(device)
    self.actor_target = Actor(state_size, action_size, cfg.random_seed,
                              cfg.dense_layers_actor).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=cfg.lr_actor)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size * num_agents, action_size * num_agents,
                               cfg.random_seed, cfg.dense_layers_critic).to(device)
    self.critic_target = Critic(state_size * num_agents, action_size * num_agents,
                                cfg.random_seed, cfg.dense_layers_critic).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=cfg.lr_critic,
                                       weight_decay=cfg.weight_decay)

    self.hard_copy_weights(self.critic_local, self.critic_target)
    self.hard_copy_weights(self.actor_local, self.actor_target)
    self.t_step = 0

    # Noise process
    self.noise = OUNoise(action_size, cfg.random_seed, theta=cfg.theta_ou, sigma=cfg.sigma_ou)

    # Replay memory
    self.memory = ReplayBuffer(action_size, cfg.buffer_size, cfg.batch_size, cfg.random_seed, cfg)
def reset_agent(self):
    self.replay_buffer = ReplayBuffer(self.state_dim, self.action_dim, self.bfs)
    self.step_count = 0
    self.total_step_count = 0
    self.train_count = 0
    self.episode_count = 0
    self.sess.run(tf.global_variables_initializer())
    self.sess.run(self.T_init)