def loadagent(ckp_name, *args, **kwargs):
    agent = DDPGAgent(*args, **kwargs)
    agent.path = ckp_name
    actor_state_dict, critic_state_dict = torch.load(ckp_name)
    agent.actor_local.load_state_dict(actor_state_dict)
    agent.actor_target.load_state_dict(actor_state_dict)
    agent.critic_local.load_state_dict(critic_state_dict)
    agent.critic_target.load_state_dict(critic_state_dict)
    agent.lr_actor *= agent.lr_decay
    agent.lr_critic *= agent.lr_decay
    return agent

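# A matching checkpoint writer is implied by loadagent above: torch.load(ckp_name)
# must yield an (actor_state_dict, critic_state_dict) tuple. A minimal sketch under
# that assumption; the name saveagent is hypothetical and the attribute layout
# simply mirrors loadagent.
def saveagent(agent, ckp_name):
    # store both state dicts as a single tuple so loadagent can unpack them
    torch.save((agent.actor_local.state_dict(),
                agent.critic_local.state_dict()), ckp_name)
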
def __init__(self, state_size, action_size, seed, discount_factor=GAMMA, tau=TAU):
    super(MADDPG, self).__init__()

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

    # critic input = obs_full + actions = 14 + 2 + 2 + 2 = 20
    self.maddpg_agent = [
        DDPGAgent(state_size, action_size, seed),
        DDPGAgent(state_size, action_size, seed)
    ]
    self.discount_factor = discount_factor
    self.tau = tau
    self.iter = 0
    self.t_step = 0

def __init__(self, env, num_agents, alpha, beta, tau, input_dims, n_actions,
             hd1_dims=400, hd2_dims=300, mem_size=1000000, gamma=0.99,
             batch_size=64):
    self.env = env
    self.num_agents = int(num_agents)
    self.alpha = alpha
    self.beta = beta
    self.gamma = gamma
    self.tau = tau
    self.batch_size = batch_size
    self.agents = [
        DDPGAgent(alpha=self.alpha, beta=self.beta, tau=self.tau,
                  input_dims=input_dims, n_actions=n_actions,
                  hd1_dims=hd1_dims, hd2_dims=hd2_dims, mem_size=mem_size,
                  gamma=self.gamma, batch_size=self.batch_size, agent_no=i)
        for i in range(self.num_agents)
    ]
    self.agents_states = []
    self.local_agent_states = [[] for i in range(self.num_agents)]

def __init__(self, state_size, action_size, memory, num_agents, config):
    super(DDPGMultiAgent, self).__init__()
    self.commonMemory = memory  # Replay memory

    # Hyperparameters
    self.random_seed = config["SEED"]
    self.gamma = config["GAMMA"]
    self.tau = config["TAU"]
    self.lrActor = config["LR_ACTOR"]
    self.lrCritic = config["LR_CRITIC"]
    self.mu = config["MU"]
    self.theta = config["THETA"]
    self.sigma = config["SIGMA"]
    self.isNNHardcopy = False
    self.explorfactor = config["EXPLORE"]
    self.micro_batch_size = config["BATCH_SIZE"]
    self.num_agents = num_agents
    self.action_size = action_size
    self.state_size = state_size

    # Actor network (w/ target network)
    self.actor_local = Actor(state_size, action_size, self.random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, self.random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lrActor)

    # Critic network (w/ target network)
    # self.critic_local = Critic(state_size, action_size, self.random_seed).to(device)
    # self.critic_target = Critic(state_size, action_size, self.random_seed).to(device)
    # self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lrCritic, weight_decay=0.)

    # self.multiagent = [DDPGAgent(state_size, action_size, self.actor_local, self.actor_target,
    #                              self.actor_optimizer, self.critic_local, self.critic_target,
    #                              self.critic_optimizer, p_gamma=self.gamma, p_lr_actor=self.lrActor,
    #                              p_lr_critic=self.lrCritic, p_seed=self.p_seed, p_mu=self.mu,
    #                              p_theta=self.theta, p_sigma=self.sigma, p_tau=self.tau,
    #                              p_targetcopy=self.isNNHardcopy, p_explore=self.explorfactor)
    #                    for agent in range(num_agents)]

    # Actor (local and target) is common for both agents
    self.multiagent = [
        DDPGAgent(state_size, action_size, self.actor_local, self.actor_target,
                  self.actor_optimizer, config)
        for agent in range(num_agents)
    ]

    # Noise process
    self.noise = OUNoise(action_size, self.random_seed, self.mu, self.theta, self.sigma)

    # Trained mode = True: copy local network weights to the target networks
    if self.isNNHardcopy:
        self.hard_update(self.actor_local, self.actor_target)
        self.hard_update(self.critic_local, self.critic_target)

    print("HP: Gamma: {}, TAU: {}, LR_Act: {}, LR_Critic: {}, Mu: {}, Theta: {}, "
          "Sigma: {}, ExploreFactor: {}, IsTargetHardcopy: {}".format(
              self.gamma, self.tau, self.lrActor, self.lrCritic, self.mu,
              self.theta, self.sigma, self.explorfactor, self.isNNHardcopy))

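# hard_update is called by the wrapper above but not defined in this snippet.
# A minimal sketch of the usual DDPG parameter-copy helpers, assuming PyTorch
# modules; soft_update is the standard counterpart that uses the stored tau
# (theta_target = tau * theta_local + (1 - tau) * theta_target) and is included
# here only as an assumption.
def hard_update(self, local_model, target_model):
    # copy local weights into the target network verbatim
    for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
        target_param.data.copy_(local_param.data)

def soft_update(self, local_model, target_model, tau):
    # blend local weights into the target network with interpolation factor tau
    for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
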
def play():
    env = UnityEnvironment(file_name='./Reacher.app')

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    # size of each action
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))

    # create agent
    agent = DDPGAgent(state_size=state_size, action_size=action_size, seed=0)

    # load weights
    agent.policy_local.load_state_dict(torch.load('policy.pth'))

    env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
    state = env_info.vector_observations[0]             # get the current state of the first agent
    score = 0                                           # initialize the score
    while True:
        action = agent.act(state, add_noise=False)      # select an action
        env_info = env.step(action)[brain_name]         # send the action to the environment
        next_state = env_info.vector_observations[0]    # get the next state
        reward = env_info.rewards[0]                    # get the reward
        done = env_info.local_done[0]                   # see if the episode has finished
        score += reward                                 # update the score
        state = next_state                              # roll over the state to the next time step
        if done:                                        # exit the loop if the episode finished
            break

    print('Total score this episode: {}'.format(score))
    env.close()

def __init__(self, config):
    self.config = config
    if config.shared_replay_buffer:
        self.memory = config.memory_fn()
        self.config.memory = self.memory
    self.ddpg_agents = [DDPGAgent(self.config) for _ in range(config.num_agents)]
    self.t_step = 0

def __init__(self):
    self.env = make_env(scenario_name='scenarios/new_env')  # 'simple_spread'
    self.num_agents = self.env.n
    self.agents = [
        DDPGAgent(self.env, agent_id, actor_lr=0.0, critic_lr=0.0, gamma=1.0)
        for agent_id in range(self.num_agents)
    ]
    for agent in self.agents:
        # agent.actor.load_state_dict(torch.load('./saved_weights/actor_3000.weights', map_location=torch.device('cpu')))
        # agent.critic.load_state_dict(torch.load('./saved_weights/critic_3000.weights', map_location=torch.device('cpu')))
        pass
    self.reset()

def train(train_env_id: str,
          eval_env_id: str,
          logdir: str,
          cfg: ExperimentConfig,
          save_path: str,
          pretrain_path: Optional[str] = None) -> DDPGAgent:
    pretrain = torch.load(os.path.join(pretrain_path)) \
        if pretrain_path is not None \
        else None
    env = set_env_metadata(train_env_id, cfg)
    train_env = make_vec_env(train_env_id, num_envs=cfg.episodes_per_cycle,
                             no_timeout=True, seed=cfg.seed)
    eval_env = make_vec_env(eval_env_id, num_envs=cfg.num_eval_envs,
                            no_timeout=True, seed=cfg.seed + 100)
    replay = HERReplayBuffer(cfg=cfg)
    tf_logger = TensorboardLogger(logdir)
    actor = ActorNet(obs_dim=cfg.obs_dim, goal_dim=cfg.goal_dim,
                     action_dim=cfg.action_dim, action_range=cfg.action_range,
                     zero_last=(pretrain_path is not None))
    critic = CriticNet(obs_dim=cfg.obs_dim, goal_dim=cfg.goal_dim,
                       action_dim=cfg.action_dim, action_range=cfg.action_range)
    normalizer = Normalizer(cfg.obs_dim + cfg.goal_dim) \
        if pretrain is None \
        else pretrain.normalizer
    agent = DDPGAgent(cfg=cfg, actor=actor, critic=critic, normalizer=normalizer,
                      reward_fn=env.compute_reward,
                      pretrain=getattr(pretrain, 'actor', None))
    engine = DDPGEngine(cfg=cfg, agent=agent, train_env=train_env,
                        eval_env=eval_env, replay=replay, tf_logger=tf_logger)
    engine.train()
    env.close()
    train_env.close()
    eval_env.close()
    torch.save(agent, os.path.join(save_path))
    return agent

def __init__(self, arg, memory):
    """
    Args:
        arg: command line arguments parameter
        memory: (ReplayBuffer) saves experience
    """
    self.memory = memory
    self.discount = arg.discount
    self.batch_size = arg.batch_size
    self.update_every = arg.update_every
    self.ddpg_agents = [DDPGAgent(arg, memory) for _ in range(arg.num_agents)]
    self.t_step = 0

def main(args):
    with open(args.param, "r") as f:
        config = json.load(f)
    config["locexp"] = args.locexp
    path = args.locexp
    # experiment_name = args.experiment_name
    vid_path = os.path.join(path, "videos-{}".format(args.seed))
    if not os.path.exists(vid_path):
        os.makedirs(vid_path)
    res_path = os.path.join(path, "results")
    if not os.path.exists(res_path):
        os.makedirs(res_path)
    config["vid_path"] = vid_path
    config["res_path"] = res_path
    config["seed"] = args.seed
    env = gym.make("LunarLanderContinuous-v2")
    config["max_action"] = env.action_space.high[0]
    config["min_action"] = env.action_space.low[0]
    print(str(config))
    action_size = env.action_space.shape[0]
    state_size = env.observation_space.shape[0]
    agent = DDPGAgent(action_size=action_size, state_size=state_size, config=config)
    agent.train_agent()

def testAgent():
    print("Testing the Agent")
    agent = DDPGAgent(state_size=state_size, action_size=action_size,
                      n_agents=n_agents, seed=0,
                      pretrainedWeightsFile='checkpoint_actor.pth', train=False)
    env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
    states = env_info.vector_observations               # get the current state
    score = np.zeros(n_agents)                          # initialize the score
    while True:
        actions = agent.act(states)                     # select an action
        env_info = env.step(actions)[brain_name]        # send the action to the environment
        next_states = env_info.vector_observations      # get the next state
        rewards = env_info.rewards                      # get the reward
        dones = env_info.local_done                     # see if the episode has finished
        score += np.array(rewards)                      # update the score
        states = next_states                            # roll over the state to the next time step
        if np.any(dones):                               # exit the loop if the episode finished
            break
    print("Score: {}".format(np.mean(score)))
    return score

def __init__(self, action_size, state_size, shared_replay_buffer, num_agents):
    self.shared_replay_buffer = shared_replay_buffer
    memory_fn = lambda: ReplayBuffer(action_size, int(1e6), BATCH_SIZE, SEED, DEVICE)
    memory = None
    if shared_replay_buffer:
        self.memory = memory_fn()
        memory = self.memory
    self.ddpg_agents = [
        DDPGAgent(action_size, state_size, shared_replay_buffer, memory)
        for _ in range(num_agents)
    ]
    self.t_step = 0

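# ReplayBuffer is constructed above via memory_fn but not defined in this snippet.
# A minimal sketch of a compatible uniform-sampling buffer matching the
# ReplayBuffer(action_size, buffer_size, batch_size, seed, device) call; the
# experience field names and tensor conversion are assumptions.
import random
from collections import deque, namedtuple

import numpy as np
import torch


class ReplayBuffer:
    def __init__(self, action_size, buffer_size, batch_size, seed, device):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)      # fixed-size FIFO store
        self.batch_size = batch_size
        self.device = device
        self.experience = namedtuple("Experience",
                                     ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        # append a single transition
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        # draw a uniform random minibatch and stack it into torch tensors
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(self.device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(self.device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(self.device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(self.device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(self.device)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)
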
def main():
    parser = argparse.ArgumentParser(
        description="Run Extended Q-Learning with given config")
    parser.add_argument(
        "-c", "--config", type=str, metavar="", required=True,
        help="Config file name - file must be available as .json in ./configs")
    args = parser.parse_args()

    # load config files
    with open(os.path.join(".", "configs", args.config), "r") as read_file:
        config = json.load(read_file)

    env = UnityEnvironment(file_name=os.path.join(*config["env_path"]))
    noise = OrnsteinUhlenbeckNoise(config["n_actions"], config["mu"],
                                   config["theta"], config["sigma"],
                                   config["seed"])
    replay_buffer = ReplayBuffer(config["buffer_size"], config["device"],
                                 config["seed"])
    agent = DDPGAgent(config, noise, replay_buffer)

    if config["run_training"]:
        session.train(agent, env, config)
        checkpoint_dir = os.path.join(".", *config["checkpoint_dir"], config["env_name"])
        utils.save_state_dict(os.path.join(checkpoint_dir, "actor"),
                              agent.actor.state_dict())
        utils.save_state_dict(os.path.join(checkpoint_dir, "critic"),
                              agent.critic.state_dict())
    else:
        checkpoint_dir = os.path.join(".", *config["checkpoint_dir"], config["env_name"])
        agent.actor.load_state_dict(
            utils.load_latest_available_state_dict(
                os.path.join(checkpoint_dir, "actor", "*")))
        agent.critic.load_state_dict(
            utils.load_latest_available_state_dict(
                os.path.join(checkpoint_dir, "critic", "*")))
        session.evaluate(agent, env, num_test_runs=1)

    env.close()

def ddpg_run(episodes=1000, seed=42):
    env = start_env()
    env_info = reset_env_info(env)
    state_size = get_state_size(env_info)
    action_size = get_action_size(env)
    print('Seed used:', seed)
    total_agents = get_total_agents(env_info)
    agent = DDPGAgent(total_agents, state_size, action_size, seed)
    scores = []
    scores_window = deque(maxlen=100)

    for episode in range(1, episodes + 1):
        init_time = datetime.datetime.now()
        env_info = reset_env_info(env)
        score = np.zeros(total_agents)
        dones = np.zeros(total_agents)
        agent.reset()
        critic_losses = []
        actor_losses = []

        while not np.any(dones):
            states = env_info.vector_observations
            actions = agent.act(states, add_noise=True)
            env_info = env_step(env, actions)
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            critic_loss, actor_loss = agent.step(states, actions, rewards, next_states, dones)
            critic_losses.append(critic_loss)
            actor_losses.append(actor_loss)
            # print('\rActor Loss: {:.6f} - Critic Loss: {:.6f}'.format(actor_loss, critic_loss), end='')
            score += rewards

        scores_window.append(np.mean(score))
        scores.append(np.mean(score))
        print('Ep. {}/{} - Avg Global Score: {:.2f} - Avg Ep. Score: {:.2f} - '
              'Min Ep. Score: {:.2f} - Max Ep. Score: {:.2f} - '
              'Actor loss: {:.6f}, Critic loss: {:.6f} - time: {}'.format(
                  episode, episodes, np.mean(scores_window), np.mean(score),
                  np.min(score), np.max(score), np.mean(actor_losses),
                  np.mean(critic_losses), datetime.datetime.now() - init_time))

        if np.mean(scores_window) >= 30.0 and episode >= 100:
            print('\nEnvironment solved (mean of 30.0 for 100 episodes) in {:d} episodes!'
                  '\tAverage Score: {:.2f}'.format(episode, np.mean(scores_window)))
            torch.save(agent.actor_local.state_dict(), 'actor_local_checkpoint.pth')
            torch.save(agent.actor_target.state_dict(), 'actor_target_checkpoint.pth')
            torch.save(agent.critic_local.state_dict(), 'critic_local_checkpoint.pth')
            torch.save(agent.critic_target.state_dict(), 'critic_target_checkpoint.pth')
            break

    env.close()
    return scores

def __init__(self):
    super(MADDPG, self).__init__()
    self.env = make_env(scenario_name='simple_spread')
    self.num_agents = self.env.n
    self.replay_buffer = MultiAgentReplayBuffer(self.num_agents, cfg.buffer_maxlen)
    self.agents = [
        DDPGAgent(self.env, agent_id, actor_lr=cfg.actor_lr,
                  critic_lr=cfg.critic_lr, gamma=cfg.gamma)
        for agent_id in range(self.num_agents)
    ]
    self.episode_rewards = list()
    self.episode = 0
    self.episode_reward = 0
    self.populate(cfg.warm_start_steps)
    self.states = self.env.reset()
    self.reset()
    if not os.path.exists(os.path.join(os.getcwd(), 'saved_weights')):
        os.mkdir(os.path.join(os.getcwd(), 'saved_weights'))

def __init__(self, state_size, action_size, memory, num_agents, seed=1,
             p_gamma=0.917, p_tau=0.001, p_lrAct=0.0001, p_lrCritic=0.0002,
             theta=0.17, sigma=0.24):
    super(DDPGMultiAgent, self).__init__()
    self.multiagent = [
        DDPGAgent(state_size, action_size, random_seed=seed,
                  p_theta=theta, p_sigma=sigma)
        for agent in range(num_agents)
    ]
    print("Gamma: {}, LR_Act: {}, LR_Critic: {}, Theta: {}, Sigma: {}".format(
        p_gamma, p_lrAct, p_lrCritic, theta, sigma))
    self.commonMemory = memory  # Replay memory

    # Hyperparameters
    self.gamma = p_gamma
    self.random_seed = seed

    # Actor network (w/ target network)
    self.actor_local = Actor(state_size, action_size, self.random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, self.random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic network (w/ target network)
    self.critic_local = Critic(state_size, action_size, self.random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, self.random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                       weight_decay=WEIGHT_DECAY)

    # Noise process
    self.noise = OUNoise(action_size, self.random_seed)

# number of agents
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))

# create agent
agent = DDPGAgent(state_size=state_size, action_size=action_size, seed=0)

# do training
scores, avg_scores = train(env, agent)
env.close()

# plot results
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(len(scores)), scores, label='score')
plt.plot(np.arange(len(avg_scores)), avg_scores, c='r', label='avg score')
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.legend(loc='upper left')
plt.savefig('scores_20.png')

from agent import DDPGAgent, DDPGArgs
from trainer import CentralizedTrainer
from numeric_env import MultiEnv
import torch

torch.set_num_threads(1)

env = MultiEnv(2, 2)
args = DDPGArgs(state_dim=env.STATE_DIM, action_dim=4)
agent = DDPGAgent(args)
trainer = CentralizedTrainer(agent, env, log_dir='../logs/ddpg_c')
trainer.train(1000000)

def __init__(self, env, buffer_maxlen):
    self.env = env
    self.num_agents = env.n
    self.replay_buffer = MultiAgentReplayBuffer(self.num_agents, buffer_maxlen)
    self.agents = [DDPGAgent(self.env, i) for i in range(self.num_agents)]

        dones = env_info.local_done                 # see if the episode has finished
        score += np.array(rewards)                  # update the score
        states = next_states                        # roll over the state to the next time step
        if np.any(dones):                           # exit the loop if the episode finished
            break
    print("Score: {}".format(np.mean(score)))
    return score


def plotScores(scores):
    # plot the scores
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(np.arange(len(scores)), scores)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.show()


if args.mode == 'train':
    agent = DDPGAgent(state_size=state_size, action_size=action_size,
                      n_agents=n_agents, seed=48)
    scores = trainAgent(agent, n_episodes=1000)
    plotScores(scores)
elif args.mode == 'test':
    testAgent()
else:
    print("Invalid Mode")

        'critic_params': {  # critic parameters
            'norm': True,
            'lr': 0.001,                        # learning rate
            'weight_decay': 0.0,                # weight decay
            'state_size': state_size,           # size of the state space
            'action_size': action_size,         # size of the action space
            'seed': seedGenerator,              # seed of the network architecture
            'hidden_layers': [512, 512, 128],   # hidden layer neurons
            'dropout': 0.05,
            'action_layer': 1,
            'act_fn': [F.leaky_relu, F.leaky_relu, lambda x: x]
        },
        'noise_params': {  # parameters for the noise process
            'mu': 0.,                           # mean
            'theta': 0.15,                      # theta value for the Ornstein-Uhlenbeck process
            'sigma': 0.2,                       # variance
            'seed': seedGenerator,              # seed
            'action_size': action_size
        }
    }
}

agents = [DDPGAgent(idx=idx, params=params['agent_params'])
          for idx, a in enumerate(range(num_agents))]
scores = train(agents=agents, params=params)

df = pd.DataFrame(data={'episode': np.arange(len(scores)), 'DDPG-3': scores})
df.to_csv('results/DDPG-3-scores.csv', index=False)

import torch
from torch import nn
from torchviz import make_dot, make_dot_from_trace
from graphviz import Digraph

from agent import DDPGAgent

x = torch.randn(5, 24).cuda()
y = torch.randn(5, 2).cuda()

agent = DDPGAgent(state_dim=24, action_dim=2)

dot = make_dot(agent.critic_local(x, y), params=dict(agent.critic_local.named_parameters()))
dot.format = 'png'
dot.render("static/ddpg_critic_model")

dot = make_dot(agent.actor_local(x), params=dict(agent.actor_local.named_parameters()))
dot.format = 'png'
dot.render("static/ddpg_actor_model")

from agent import DDPGAgent, DDPGArgs
from trainer import DistributedTrainer
from numeric_env import MultiEnv
import torch

torch.set_num_threads(1)

env = MultiEnv(2, 2)
args = DDPGArgs(state_dim=env.STATE_DIM, action_dim=2)
agents = [DDPGAgent(args) for _ in range(2)]
trainer = DistributedTrainer(agents, env, parameter_share=False, log_dir='../logs/ddpg_d')
trainer.train(1000000)

import gym
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt

from agent import DDPGAgent

env = gym.make('BipedalWalker-v2')
env.seed(0)
agent = DDPGAgent(env.observation_space.shape[0], env.action_space.shape[0], 0)

n_episodes = 2000
t_max = 1000


def train(n_episodes=2000, t_max=1000):
    score_deque = deque(maxlen=100)
    scores = []
    for ep in range(1, n_episodes + 1):
        state = env.reset()
        agent.reset()
        score = 0
        for step in range(t_max):
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action[0])
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break

# bundle the environment and hyperparameters into the running config
configs = {
    'args': args,
    'env': env,
    'gamma': 0.99,
    'actor_lr': 0.001,
    'critic_lr': 0.01,
    'tau': 0.02,
    'capacity': 10000,
    'batch_size': 32,
    'using_cuda': args.cuda > 0,
}

agent = DDPGAgent(**configs)
# agent.show_model()

if args.RUNNING_TYPE == "train":
    trainer = Trainer(agent, env, configs)
    trainer.train()
elif args.RUNNING_TYPE == "retrain":
    episode, step = agent.load_checkpoint(args.CHECKPOINT_DIR, args.CHECKPOINT_START_EPISODE)
    trainer = Trainer(agent, env, configs)
    trainer.train(episode, step)
elif args.RUNNING_TYPE == "test":
    tester = Tester(agent, env, './running_log/model')
    tester.test(True)
else:
    print("unknown running type: ", args.RUNNING_TYPE)

def __init__(self, state_size, action_size, num_agents):
    super(DDPGMultiAgent, self).__init__()
    self.gamma = 0.997
    self.multiagent = [DDPGAgent(state_size, action_size) for agent in range(num_agents)]

# Note: could be extended to a CLI (argparse)
agent_config = {
    'seed': 1,
    'batch_size': 128,
    'memory_size': int(1e5),
    'gamma': 0.99,
    'tau': 1e-3,
    'actor_lr': 1e-3,
    'critic_lr': 1e-3,
    'update_every': 20,
    'update_iterations': 10,
    'noise_decay': 0.999,
    'number_agents': len(env_info.agents)
}

agent = DDPGAgent(state_size, action_size, agent_config)

root_path = 'weights'
training_config = {
    'nepisodes': 500,
    'nsteps': 1000,
    'average_over_episodes': 100,
    'target_score': 30,
    'brain_name': brain_name,
    'number_agents': len(env_info.agents),

        },
        'critic_params': {  # critic parameters
            'norm': True,
            'lr': 1e-5,                         # learning rate
            'weight_decay': 5e-8,               # weight decay
            'state_size': state_size,           # size of the state space
            'action_size': action_size,         # size of the action space
            'seed': seedGenerator,              # seed of the network architecture
            'hidden_layers': [512, 512, 128],   # hidden layer neurons
            'dropout': 0.05,
            'action_layer': 1,
            # 'act_fn': [F.leaky_relu, F.leaky_relu, lambda x: x]
            'act_fn': [nn.ELU(), nn.ELU(), lambda x: x]
        },
        'noise_params': {  # parameters for the noise process
            'mu': 0.,                           # mean
            'theta': 0.15,                      # theta value for the Ornstein-Uhlenbeck process
            'sigma': 0.2,                       # variance
            'seed': seedGenerator,              # seed
            'action_size': action_size
        }
    }
}

# agents = [DDPGAgent(idx=idx, params=params['agent_params']) for idx, a in enumerate(range(num_agents))]
agents = DDPGAgent(idx=0, params=params['agent_params'])
scores = train(agents=agents, params=params, num_processes=num_agents)

df = pd.DataFrame(data={'episode': np.arange(len(scores)), 'DDPG-3': scores})
df.to_csv('results/DDPG-3-scores.csv', index=False)

def __init__(self, action_size, state_size, n_agents,
             GAMMA=0.99, MEMORY_SIZE=int(2e4), BATCH_SIZE=256, WARMP_UP=4096,
             TAU=5e-3, actor_layers=[256, 128], actor_input_bn=True,
             actor_hidden_bn=False, critic_state_layers=[256],
             critic_final_layers=[256, 128], critic_state_bn=True,
             critic_final_bn=False, apply_post_bn=False,
             noise_scaling_factor=2., noise_scaling_factor_dec=0.9,
             noise_scaling_min=0.2, LR_ACTOR=1e-4, LR_CRITIC=2e-4,
             huber_loss=False, DEBUG=False, OUNoise=True, activation='relu',
             PER=None, min_non_zero_prc=0.35, name='MADDPG', dev=None):
    self.__name__ = name
    self.DEBUG = DEBUG
    self.n_agents = n_agents
    self.action_size = action_size
    self.GAMMA = GAMMA
    self.TAU = 0.02  # note: hard-coded, overrides the TAU argument
    self.BATCH_SIZE = BATCH_SIZE
    self.WARM_UP = WARMP_UP
    self.state_size = state_size
    self.current_episode = 0

    if dev is None:
        dev = th.device("cuda:0" if th.cuda.is_available() else "cpu")
    self.dev = dev
    self.huber_loss = huber_loss

    if PER == 'none':
        self.min_non_zero_prc = 0
        self.PER = False
        self.memory = MASimpleReplayBuffer(capacity=MEMORY_SIZE,
                                           nr_agents=self.n_agents,
                                           engine='torch',
                                           device=self.dev,
                                           min_non_zero_prc=0)
    elif PER == "sparsity":
        self.min_non_zero_prc = min_non_zero_prc
        self.PER = False
        self.memory = MASimpleReplayBuffer(capacity=MEMORY_SIZE,
                                           nr_agents=self.n_agents,
                                           engine='torch',
                                           device=self.dev,
                                           min_non_zero_prc=min_non_zero_prc)
    elif PER == 'PER1':
        self.PER = True
        self.memory = MASimplePER(capacity=MEMORY_SIZE,
                                  nr_agents=self.n_agents,
                                  engine='torch',
                                  device=self.dev)
    else:
        raise ValueError("Unknown PER parameter value '{}'".format(PER))

    self.agents = []
    for i in range(n_agents):
        self.agents.append(
            DDPGAgent(a_size=self.action_size,
                      s_size=self.state_size,
                      dev=self.dev,
                      n_agents=self.n_agents,
                      TAU=self.TAU,
                      bn_post=apply_post_bn,
                      actor_layers=actor_layers,
                      actor_input_bn=actor_input_bn,
                      actor_hidden_bn=actor_hidden_bn,
                      critic_state_bn=critic_state_bn,
                      critic_final_bn=critic_final_bn,
                      critic_final_layers=critic_final_layers,
                      critic_state_layers=critic_state_layers,
                      OUnoise=OUNoise,
                      LR_ACTOR=LR_ACTOR,
                      LR_CRITIC=LR_CRITIC,
                      activation=activation,
                      name='{}_Agent_{}'.format(self.__name__, i + 1)))
    self.agents[0].show_architecture()

    self.noise_scaling_factor = noise_scaling_factor
    self.noise_scaling_factor_dec = noise_scaling_factor_dec
    self.n_updates = 0
    self.noise_scaling_min = noise_scaling_min
    return