Example #1
def loadagent(ckp_name, *args, **kwargs):
    # rebuild a DDPGAgent and restore actor/critic weights from a saved checkpoint tuple
    agent = DDPGAgent(*args, **kwargs)
    agent.path = ckp_name
    actor_state_dict, critic_state_dict = torch.load(ckp_name)
    agent.actor_local.load_state_dict(actor_state_dict)
    agent.actor_target.load_state_dict(actor_state_dict)
    agent.critic_local.load_state_dict(critic_state_dict)
    agent.critic_target.load_state_dict(critic_state_dict)
    agent.lr_actor *= agent.lr_decay
    agent.lr_critic *= agent.lr_decay
    return agent
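For reference, a minimal counterpart sketch that writes the (actor_state_dict, critic_state_dict) tuple this loader expects; the name saveagent is hypothetical and assumes the agent exposes the same actor_local/critic_local modules used above.

import torch

def saveagent(agent, ckp_name):
    # persist both networks as the tuple consumed by loadagent()
    torch.save((agent.actor_local.state_dict(),
                agent.critic_local.state_dict()), ckp_name)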
Example #2
    def __init__(self, state_size, action_size, seed, discount_factor=GAMMA, tau=TAU):
        super(MADDPG, self).__init__()

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        # critic input = obs_full + actions = 14+2+2+2=20
        self.maddpg_agent = [
            DDPGAgent(state_size, action_size, seed), 
            DDPGAgent(state_size, action_size, seed)
        ]
        
        self.discount_factor = discount_factor
        self.tau = tau
        self.iter = 0
        self.t_step = 0
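A possible act dispatch for this wrapper is sketched below; it assumes each DDPGAgent exposes an act(state, add_noise) method, which is not shown in the snippet.

    def act(self, states, add_noise=True):
        # one observation row per agent; each DDPGAgent picks its own action
        return [agent.act(state, add_noise)
                for agent, state in zip(self.maddpg_agent, states)]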
Example #3
 def __init__(self,
              env,
              num_agents,
              alpha,
              beta,
              tau,
              input_dims,
              n_actions,
              hd1_dims=400,
              hd2_dims=300,
              mem_size=1000000,
              gamma=0.99,
              batch_size=64):
     self.env = env
     self.num_agents = int(num_agents)
     self.alpha = alpha
     self.beta = beta
     self.gamma = gamma
     self.tau = tau
     self.batch_size = batch_size
     self.agents = [
         DDPGAgent(alpha=self.alpha,
                   beta=self.beta,
                   tau=self.tau,
                   input_dims=input_dims,
                   n_actions=n_actions,
                   hd1_dims=hd1_dims,
                   hd2_dims=hd2_dims,
                   mem_size=mem_size,
                   gamma=self.gamma,
                   batch_size=self.batch_size,
                   agent_no=i) for i in range(self.num_agents)
     ]
     self.agents_states = []
     self.local_agent_states = [[] for i in range(self.num_agents)]
Example #4
    def __init__(self, state_size, action_size, memory, num_agents, config):
        super(DDPGMultiAgent, self).__init__()

        self.commonMemory = memory  # Replay memory

        #Hyper Params

        self.random_seed = config["SEED"]
        self.gamma = config["GAMMA"]
        self.tau = config["TAU"]
        self.lrActor = config["LR_ACTOR"]
        self.lrCritic = config["LR_CRITIC"]

        self.mu = config["MU"]
        self.theta = config["THETA"]
        self.sigma = config["SIGMA"]
        self.isNNHardcopy = False
        self.explorfactor = config["EXPLORE"]
        self.micro_batch_size = config["BATCH_SIZE"]

        self.num_agents = num_agents
        self.action_size = action_size
        self.state_size = state_size

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 self.random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  self.random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.lrActor)

        # Critic Network (w/ Target Network)
        #self.critic_local = Critic(state_size, action_size, self.random_seed).to(device)
        #self.critic_target = Critic(state_size, action_size, self.random_seed).to(device)
        #self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lrCritic, weight_decay=0.)

        #self.multiagent = [DDPGAgent (state_size,action_size,self.actor_local, self.actor_target,self.actor_optimizer,self.critic_local,self.critic_target,self.critic_optimizer,p_gamma=self.gamma,p_lr_actor= self.lrActor,p_lr_critic= self.lrCritic,p_seed=self.p_seed,p_mu=self.mu,p_theta=self.theta, p_sigma=self.sigma,p_tau=self.tau ,p_targetcopy=self.isNNHardcopy,p_explore=self.explorfactor) for agent in range(num_agents)]
        #Actor is Common for Both Agents and Target for each Agent
        self.multiagent = [
            DDPGAgent(state_size, action_size, self.actor_local,
                      self.actor_target, self.actor_optimizer, config)
            for agent in range(num_agents)
        ]

        # Noise process
        self.noise = OUNoise(action_size, self.random_seed, self.mu,
                             self.theta, self.sigma)

        #Trained mode = true , copy local NN weights to target NN
        if (self.isNNHardcopy):
            # only the shared actor is hard-copied here; the critic networks live inside each DDPGAgent
            self.hard_update(self.actor_local, self.actor_target)
        print(
            "HP :Gamma :{} , TAU:{}, LR_Act :{} , LR_Critic {} , Mu {}, Theta {}, Sigma {}, ExploreFactor {}, IsTargetHardcopy{}"
            .format(self.gamma, self.tau, self.lrActor, self.lrCritic, self.mu,
                    self.theta, self.sigma, self.explorfactor,
                    self.isNNHardcopy))
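The hard_update helper referenced above is not shown; a common implementation simply copies every parameter of the local network into the target network, for example:

    def hard_update(self, local_model, target_model):
        # overwrite target parameters with the local network's parameters
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(local_param.data)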
Example #5
def play():
    env = UnityEnvironment(file_name='./Reacher.app')

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    # size of each action
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)

    # examine the state space 
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))

    # create agent
    agent = DDPGAgent(state_size=state_size, action_size=action_size, seed=0)

    # load weights
    agent.policy_local.load_state_dict(torch.load('policy.pth'))

    env_info = env.reset(train_mode=False)[brain_name]     # reset the environment    
    state = env_info.vector_observations[0]                # get the current state (for each agent)
    score = 0                                              # initialize the score (for each agent)
    while True:
        action = agent.act(state, add_noise=False)         # select an action (for each agent)
        env_info = env.step(action)[brain_name]            # send all actions to the environment
        next_state = env_info.vector_observations[0]       # get next state (for each agent)
        reward = env_info.rewards[0]                       # get reward (for each agent)
        done = env_info.local_done[0]                      # see if episode finished
        score += reward                                    # update the score (for each agent)
        state = next_state                                 # roll over states to next time step
        if done:                                           # exit loop if episode finished
            break
    print('Total score (averaged over agents) this episode: {}'.format(score))

    env.close()
Example #6
    def __init__(self, config):
        self.config = config

        if config.shared_replay_buffer:
            self.memory = config.memory_fn()
            self.config.memory = self.memory

        self.ddpg_agents = [
            DDPGAgent(self.config) for _ in range(config.num_agents)
        ]

        self.t_step = 0
Example #7
 def __init__(self):
     self.env = make_env(scenario_name='scenarios/new_env')#'simple_spread')
     self.num_agents = self.env.n
     self.agents = [DDPGAgent(self.env,
                              agent_id,
                              actor_lr=0.0,
                              critic_lr=0.0,
                              gamma=1.0) for agent_id in range(self.num_agents)]
     for agent in self.agents:
         #agent.actor.load_state_dict(torch.load('./saved_weights/actor_3000.weights', map_location=torch.device('cpu')))
         #agent.critic.load_state_dict(torch.load('./saved_weights/critic_3000.weights', map_location=torch.device('cpu')))
         pass
     self.reset()
Example #8
def train(train_env_id: str,
          eval_env_id: str,
          logdir: str,
          cfg: ExperimentConfig,
          save_path: str,
          pretrain_path: Optional[str] = None) -> DDPGAgent:
    pretrain = torch.load(os.path.join(pretrain_path)) \
               if pretrain_path is not None            \
               else None
    env = set_env_metadata(train_env_id, cfg)
    train_env = make_vec_env(train_env_id,
                             num_envs=cfg.episodes_per_cycle,
                             no_timeout=True,
                             seed=cfg.seed)
    eval_env = make_vec_env(eval_env_id,
                            num_envs=cfg.num_eval_envs,
                            no_timeout=True,
                            seed=cfg.seed + 100)
    replay = HERReplayBuffer(cfg=cfg)
    tf_logger = TensorboardLogger(logdir)
    actor = ActorNet(obs_dim=cfg.obs_dim,
                     goal_dim=cfg.goal_dim,
                     action_dim=cfg.action_dim,
                     action_range=cfg.action_range,
                     zero_last=(pretrain_path is not None))
    critic = CriticNet(obs_dim=cfg.obs_dim,
                       goal_dim=cfg.goal_dim,
                       action_dim=cfg.action_dim,
                       action_range=cfg.action_range)
    normalizer = Normalizer(cfg.obs_dim+cfg.goal_dim) \
                 if pretrain is None                  \
                 else pretrain.normalizer
    agent = DDPGAgent(cfg=cfg,
                      actor=actor,
                      critic=critic,
                      normalizer=normalizer,
                      reward_fn=env.compute_reward,
                      pretrain=getattr(pretrain, 'actor', None))
    engine = DDPGEngine(cfg=cfg,
                        agent=agent,
                        train_env=train_env,
                        eval_env=eval_env,
                        replay=replay,
                        tf_logger=tf_logger)
    engine.train()

    env.close()
    train_env.close()
    eval_env.close()
    torch.save(agent, os.path.join(save_path))
    return agent
Example #9
 def __init__(self, arg, memory):
     """
     Args:
         param1: (arg) command line arguments parameter
         param2: (ReplayBuffer) saves experience
     """
     self.memory = memory
     self.discount = arg.discount
     self.batch_size = arg.batch_size
     self.update_every = arg.update_every
     self.ddpg_agents = [
         DDPGAgent(arg, memory) for _ in range(arg.num_agents)
     ]
     self.t_step = 0
Example #10
def main(args):
    with open(args.param, "r") as f:
        config = json.load(f)
    config["locexp"] = args.locexp
    path = args.locexp
    # experiment_name = args.experiment_name
    vid_path = os.path.join(path, "videos-{}".format(args.seed))
    if not os.path.exists(vid_path):
        os.makedirs(vid_path)

    res_path = os.path.join(path, "results")
    if not os.path.exists(res_path):
        os.makedirs(res_path)
    config["vid_path"] = vid_path
    config["res_path"] = res_path
    config["seed"] = args.seed
    env = gym.make("LunarLanderContinuous-v2")
    config["max_action"] = env.action_space.high[0]
    config["min_action"] = env.action_space.low[0]
    print(str(config))
    action_size = env.action_space.shape[0]
    state_size = env.observation_space.shape[0]
    agent = DDPGAgent(action_size=action_size, state_size=state_size, config=config)
    agent.train_agent()
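main expects an args object with param, locexp, and seed attributes; a hedged argparse sketch that would drive it (default values are placeholders, not taken from the original project):

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--param", type=str, default="param.json",
                        help="path to the JSON hyperparameter file")
    parser.add_argument("--locexp", type=str, default="results-lunarlander",
                        help="directory for videos and results")
    parser.add_argument("--seed", type=int, default=0)
    main(parser.parse_args())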
Example #11
def testAgent():
    print("Testing the Agent")
    agent = DDPGAgent(state_size=state_size,
                      action_size=action_size,
                      n_agents=n_agents,
                      seed=0,
                      pretrainedWeightsFile='checkpoint_actor.pth',
                      train=False)
    env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
    states = env_info.vector_observations  # get the current state
    score = np.zeros(n_agents)  # initialize the score
    while True:
        actions = agent.act(states)  # select an action
        env_info = env.step(actions)[
            brain_name]  # send the action to the environment
        next_states = env_info.vector_observations  # get the next state
        rewards = env_info.rewards  # get the reward
        dones = env_info.local_done  # see if episode has finished
        score += np.array(rewards)  # update the score
        states = next_states  # roll over the state to next time step
        if np.any(dones):  # exit loop if episode finished
            break
    print("Score: {}".format(np.mean(score)))
    return score
Example #12
    def __init__(self, action_size, state_size, shared_replay_buffer,
                 num_agents):
        self.shared_replay_buffer = shared_replay_buffer
        memory_fn = lambda: ReplayBuffer(action_size, int(1e6), BATCH_SIZE,
                                         SEED, DEVICE)

        memory = None
        if shared_replay_buffer:
            self.memory = memory_fn()
            memory = self.memory

        self.ddpg_agents = [
            DDPGAgent(action_size, state_size, shared_replay_buffer, memory)
            for _ in range(num_agents)
        ]
        self.t_step = 0
Example #13
def main():
    parser = argparse.ArgumentParser(
        description="Run Extended Q-Learning with given config")
    parser.add_argument(
        "-c",
        "--config",
        type=str,
        metavar="",
        required=True,
        help="Config file name - file must be available as .json in ./configs")

    args = parser.parse_args()

    # load config files
    with open(os.path.join(".", "configs", args.config), "r") as read_file:
        config = json.load(read_file)

    env = UnityEnvironment(file_name=os.path.join(*config["env_path"]))

    noise = OrnsteinUhlenbeckNoise(config["n_actions"], config["mu"],
                                   config["theta"], config["sigma"],
                                   config["seed"])
    replay_buffer = ReplayBuffer(config["buffer_size"], config["device"],
                                 config["seed"])

    agent = DDPGAgent(config, noise, replay_buffer)

    if config["run_training"]:
        session.train(agent, env, config)
        checkpoint_dir = os.path.join(".", *config["checkpoint_dir"],
                                      config["env_name"])
        utils.save_state_dict(os.path.join(checkpoint_dir, "actor"),
                              agent.actor.state_dict())
        utils.save_state_dict(os.path.join(checkpoint_dir, "critic"),
                              agent.critic.state_dict())
    else:
        checkpoint_dir = os.path.join(".", *config["checkpoint_dir"],
                                      config["env_name"])
        agent.actor.load_state_dict(
            utils.load_latest_available_state_dict(
                os.path.join(checkpoint_dir, "actor", "*")))
        agent.critic.load_state_dict(
            utils.load_latest_available_state_dict(
                os.path.join(checkpoint_dir, "critic", "*")))
        session.evaluate(agent, env, num_test_runs=1)

    env.close()
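The config file must supply at least the keys read above; a sketch of a plausible configuration written from Python (all values are illustrative placeholders, not taken from the original project):

import json

example_config = {
    "env_path": ["environments", "Reacher"],  # joined via os.path.join above
    "env_name": "Reacher",
    "n_actions": 4,
    "mu": 0.0, "theta": 0.15, "sigma": 0.2,   # Ornstein-Uhlenbeck noise parameters
    "seed": 0,
    "buffer_size": 100000,
    "device": "cpu",
    "run_training": True,
    "checkpoint_dir": ["checkpoints"],
}
with open("configs/example.json", "w") as f:
    json.dump(example_config, f, indent=2)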
Example #14
def ddpg_run(episodes=1000, seed=42):
    env = start_env()
    env_info = reset_env_info(env)
    
    state_size = get_state_size(env_info)
    action_size = get_action_size(env)
    
    print('Seed used:', seed)
    total_agents = get_total_agents(env_info)
    agent = DDPGAgent(total_agents, state_size, action_size, seed)
    
    scores = []
    scores_window = deque(maxlen=100)
    for episode in range(1, episodes+1):
        init_time = datetime.datetime.now()
        
        env_info = reset_env_info(env)
        score = np.zeros(total_agents)
        dones = np.zeros(total_agents)
        agent.reset()
        critic_losses = []
        actor_losses = []
        while not np.any(dones):
            states = env_info.vector_observations
            actions = agent.act(states, add_noise=True)
            env_info = env_step(env, actions)
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            critic_loss, actor_loss = agent.step(states, actions, rewards, next_states, dones)
            critic_losses.append(critic_loss)
            actor_losses.append(actor_loss)
            #print('\rActor Loss: {:.6f} - Critic Loss: {:.6f}'.format(actor_loss, critic_loss), end='')
                
            score += rewards

        scores_window.append(np.mean(score))
        scores.append(np.mean(score))
        print('Ep. {}/{} - Avg Global Score: {:.2f} - Avg Ep. Score: {:.2f} - Min Ep. Score: {:.2f} - Max Ep. Score: {:.2f} - Actor loss: {:.6f}, Critic loss: {:.6f} - time: {}'.format(
            episode, episodes, np.mean(scores_window), np.mean(score), np.min(score), np.max(score),
            np.mean(actor_losses), np.mean(critic_losses), datetime.datetime.now() - init_time))
            
        if np.mean(scores_window) >= 30.0 and episode >= 100:
            print('\nEnvironment solved (mean of 30.0 for 100 episodes) in {:d} episodes!\tAverage Score: {:.2f}'.format(episode, np.mean(scores_window)))
            
            torch.save(agent.actor_local.state_dict(), 'actor_local_checkpoint.pth')
            torch.save(agent.actor_target.state_dict(), 'actor_target_checkpoint.pth')
            
            torch.save(agent.critic_local.state_dict(), 'critic_local_checkpoint.pth')
            torch.save(agent.critic_target.state_dict(), 'critic_target_checkpoint.pth')
            break
    
    env.close()
    return scores
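A hedged usage of ddpg_run, plotting the per-episode averages it returns (matplotlib assumed available, as in the other examples):

import matplotlib.pyplot as plt

scores = ddpg_run(episodes=500, seed=42)
plt.plot(scores)
plt.xlabel('Episode #')
plt.ylabel('Average score over agents')
plt.savefig('ddpg_scores.png')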
Example #15
 def __init__(self):
     super(MADDPG, self).__init__()
     self.env = make_env(scenario_name='simple_spread')
     self.num_agents = self.env.n
     self.replay_buffer = MultiAgentReplayBuffer(self.num_agents,
                                                 cfg.buffer_maxlen)
     self.agents = [
         DDPGAgent(self.env,
                   agent_id,
                   actor_lr=cfg.actor_lr,
                   critic_lr=cfg.critic_lr,
                   gamma=cfg.gamma) for agent_id in range(self.num_agents)
     ]
     self.episode_rewards = list()
     self.episode = 0
     self.episode_reward = 0
     self.populate(cfg.warm_start_steps)
     self.states = self.env.reset()
     self.reset()
     if not os.path.exists(os.path.join(os.getcwd(), 'saved_weights')):
         os.mkdir(os.path.join(os.getcwd(), 'saved_weights'))
Example #16
    def __init__(self, state_size, action_size, memory, num_agents, seed=1,
                 p_gamma=0.917, p_tau=0.001, p_lrAct=0.0001, p_lrCritic=0.0002,
                 theta=0.17, sigma=0.24):
        super(DDPGMultiAgent, self).__init__()

        self.multiagent = [
            DDPGAgent(state_size, action_size, random_seed=seed,
                      p_theta=theta, p_sigma=sigma)
            for agent in range(num_agents)
        ]
        print("Gamma: {}, LR_Act: {}, LR_Critic: {}, Theta: {}, Sigma: {}".format(
            p_gamma, p_lrAct, p_lrCritic, theta, sigma))
        self.commonMemory = memory   # Replay memory

        # Hyper Params
        self.gamma = p_gamma
        self.random_seed = seed
        
        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, self.random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, self.random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, self.random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, self.random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, self.random_seed)
Example #17
    # number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    # size of each action
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('There are {} agents. Each observes a state with length: {}'.format(
        states.shape[0], state_size))

    # create agent
    agent = DDPGAgent(state_size=state_size, action_size=action_size, seed=0)

    # do training
    scores, avg_scores = train(env, agent)

    env.close()

    # plot results
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(np.arange(len(scores)), scores, label='score')
    plt.plot(np.arange(len(avg_scores)), avg_scores, c='r', label='avg score')
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.legend(loc='upper left')
    plt.savefig('scores_20.png')
Example #18
from agent import DDPGAgent, DDPGArgs
from trainer import CentralizedTrainer
from numeric_env import MultiEnv

import torch
torch.set_num_threads(1)

env = MultiEnv(2, 2)
args = DDPGArgs(state_dim=env.STATE_DIM, action_dim=4)

agent = DDPGAgent(args)
trainer = CentralizedTrainer(agent, env, log_dir='../logs/ddpg_c')
trainer.train(1000000)
Example #19
 def __init__(self, env, buffer_maxlen):
     self.env = env
     self.num_agents = env.n
     self.replay_buffer = MultiAgentReplayBuffer(self.num_agents, buffer_maxlen)
     self.agents = [DDPGAgent(self.env, i) for i in range(self.num_agents)]
Example #20
        dones = env_info.local_done  # see if episode has finished
        score += np.array(rewards)  # update the score
        states = next_states  # roll over the state to next time step
        if np.any(dones):  # exit loop if episode finished
            break
    print("Score: {}".format(np.mean(score)))
    return score


def plotScores(scores):
    # plot the scores
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(np.arange(len(scores)), scores)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.show()


if args.mode == 'train':
    agent = DDPGAgent(state_size=state_size,
                      action_size=action_size,
                      n_agents=n_agents,
                      seed=48)
    scores = trainAgent(agent, n_episodes=1000)
    plotScores(scores)
elif args.mode == 'test':
    testAgent()
else:
    print("Invalid Mode")
Example #21
        'critic_params': {  # critic parameters
            'norm': True,
            'lr': 0.001,  # learning rate
            'weight_decay': 0.0,  # weight decay
            'state_size': state_size,  # size of the state space
            'action_size': action_size,  # size of the action space
            'seed': seedGenerator,  # seed of the network architecture
            'hidden_layers': [512, 512, 128],  # hidden layer neurons
            'dropout': 0.05,
            'action_layer': 1,
            'act_fn': [F.leaky_relu, F.leaky_relu, lambda x: x]
        },
        'noise_params': {  # parameters for the noisy process
            'mu': 0.,  # mean
            'theta': 0.15,  # theta value for the ornstein-uhlenbeck process
            'sigma': 0.2,  # variance
            'seed': seedGenerator,  # seed
            'action_size': action_size
        }
    }
}

agents = [
    DDPGAgent(idx=idx, params=params['agent_params'])
    for idx, a in enumerate(range(num_agents))
]

scores = train(agents=agents, params=params)

df = pd.DataFrame(data={'episode': np.arange(len(scores)), 'DDPG-3': scores})
df.to_csv('results/DDPG-3-scores.csv', index=False)
Example #22
import torch
from torch import nn
from torchviz import make_dot, make_dot_from_trace
from graphviz import Digraph
from agent import DDPGAgent


x = torch.randn(5,24).cuda() 
y = torch.randn(5,2).cuda() 

agent = DDPGAgent(state_dim=24, action_dim=2)

dot = make_dot(agent.critic_local(x,y), params=dict(agent.critic_local.named_parameters()))
dot.format = 'png'
dot.render("static/ddpg_critic_model")

dot = make_dot(agent.actor_local(x), params=dict(agent.actor_local.named_parameters()))
dot.format = 'png'
dot.render("static/ddpg_actor_model")
Example #23
from agent import DDPGAgent, DDPGArgs
from trainer import DistributedTrainer
from numeric_env import MultiEnv

import torch
torch.set_num_threads(1)

env = MultiEnv(2, 2)
args = DDPGArgs(state_dim=env.STATE_DIM, action_dim=2)

agents = [DDPGAgent(args) for _ in range(2)]
trainer = DistributedTrainer(agents,
                             env,
                             parameter_share=False,
                             log_dir='../logs/ddpg_d')
trainer.train(1000000)
Example #24
import gym
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt

from agent import DDPGAgent

env = gym.make('BipedalWalker-v2')
env.seed(0)
agent = DDPGAgent(env.observation_space.shape[0], env.action_space.shape[0], 0)

n_episodes = 2000
t_max = 1000


def train(n_episodes=2000, t_max=1000):
    score_deque = deque(maxlen=100)
    scores = []
    for ep in range(1, n_episodes + 1):
        state = env.reset()
        agent.reset()
        score = 0
        for step in range(t_max):
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action[0])
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward

            if done:
                break
Example #25
# install env to the running params

configs = {
    'args': args,
    'env': env,
    'gamma': 0.99,
    'actor_lr': 0.001,
    'critic_lr': 0.01,
    'tau': 0.02,
    'capacity': 10000,
    'batch_size': 32,
    'using_cuda': args.cuda > 0,
}

agent = DDPGAgent(**configs)
# agent.show_model()

if args.RUNNING_TYPE == "train":
    trainer = Trainer(agent, env, configs)
    trainer.train()
elif args.RUNNING_TYPE == "retrain":
    episode, step = agent.load_checkpoint(
        args.CHECKPOINT_DIR, args.CHECKPOINT_START_EPISODE)
    trainer = Trainer(agent, env, configs)
    trainer.train(episode, step)
elif args.RUNNING_TYPE == "test":
    tester = Tester(agent, env, './running_log/model')
    tester.test(True)
else:
    print("unknown running type: ", args.RUNNING_TYPE)
Example #26
 def __init__(self, state_size, action_size, num_agents):
     super(DDPGMultiAgent, self).__init__()
     self.gamma = 0.997
     self.multiagent = [
         DDPGAgent(state_size, action_size) for agent in range(num_agents)
     ]
Example #27
    # Note: Could be extended to a cli (argparse)
    agent_config = {
        'seed': 1,
        'batch_size': 128,
        'memory_size': int(1e5),
        'gamma': 0.99,
        'tau': 1e-3,
        'actor_lr': 1e-3,
        'critic_lr': 1e-3,
        'update_every': 20,
        'update_iterations': 10,
        'noise_decay': 0.999,
        'number_agents': len(env_info.agents)
    }
    agent = DDPGAgent(state_size, action_size, agent_config)

    root_path = 'weights'
    training_config = {
        'nepisodes': 500,
        'nsteps': 1000,
        'average_over_episodes': 100,
        'target_score': 30,
        'brain_name': brain_name,
        'number_agents': len(env_info.agents),
Example #28
        },
        'critic_params': {               # critic parameters
            'norm': True,
            'lr': 1e-5,                # learning rate
            'weight_decay': 5e-8,          # weight decay
            'state_size': state_size,    # size of the state space
            'action_size': action_size,  # size of the action space
            'seed': seedGenerator,               # seed of the network architecture
            'hidden_layers': [512, 512, 128], # hidden layer neurons
            'dropout': 0.05,
            'action_layer': 1,
            # 'act_fn': [F.leaky_relu, F.leaky_relu, lambda x: x]
            'act_fn': [nn.ELU(), nn.ELU(), lambda x: x]
        },
        'noise_params': {            # parameters for the noisy process
            'mu': 0.,                # mean
            'theta': 0.15,           # theta value for the ornstein-uhlenbeck process
            'sigma': 0.2,            # variance
            'seed': seedGenerator,         # seed
            'action_size': action_size
        }
    }
}

# agents = [DDPGAgent(idx=idx, params=params['agent_params']) for idx, a in enumerate(range(num_agents))]
agents = DDPGAgent(idx=0, params=params['agent_params']) 

scores = train(agents=agents, params=params, num_processes=num_agents)

df = pd.DataFrame(data={'episode': np.arange(len(scores)), 'DDPG-3': scores})
df.to_csv('results/DDPG-3-scores.csv', index=False)
Example #29
    def __init__(self,
                 action_size,
                 state_size,
                 n_agents,
                 GAMMA=0.99,
                 MEMORY_SIZE=int(2e4),
                 BATCH_SIZE=256,
                 WARM_UP=4096,
                 TAU=5e-3,
                 actor_layers=[256, 128],
                 actor_input_bn=True,
                 actor_hidden_bn=False,
                 critic_state_layers=[256],
                 critic_final_layers=[256, 128],
                 critic_state_bn=True,
                 critic_final_bn=False,
                 apply_post_bn=False,
                 noise_scaling_factor=2.,
                 noise_scaling_factor_dec=0.9,
                 noise_scaling_min=0.2,
                 LR_ACTOR=1e-4,
                 LR_CRITIC=2e-4,
                 huber_loss=False,
                 DEBUG=False,
                 OUNoise=True,
                 activation='relu',
                 PER=None,
                 min_non_zero_prc=0.35,
                 name='MADDPG',
                 dev=None):
        self.__name__ = name
        self.DEBUG = DEBUG
        self.n_agents = n_agents
        self.action_size = action_size
        self.GAMMA = GAMMA
        self.TAU = 0.02
        self.BATCH_SIZE = BATCH_SIZE
        self.WARM_UP = WARM_UP
        self.state_size = state_size
        self.current_episode = 0
        if dev is None:
            dev = th.device("cuda:0" if th.cuda.is_available() else "cpu")
        self.dev = dev
        self.huber_loss = huber_loss

        if PER is None or PER == 'none':
            self.min_non_zero_prc = 0
            self.PER = False
            self.memory = MASimpleReplayBuffer(capacity=MEMORY_SIZE,
                                               nr_agents=self.n_agents,
                                               engine='torch',
                                               device=self.dev,
                                               min_non_zero_prc=0)
        elif PER == "sparsity":
            self.min_non_zero_prc = min_non_zero_prc
            self.PER = False
            self.memory = MASimpleReplayBuffer(
                capacity=MEMORY_SIZE,
                nr_agents=self.n_agents,
                engine='torch',
                device=self.dev,
                min_non_zero_prc=min_non_zero_prc)
        elif PER == 'PER1':
            self.PER = True
            self.memory = MASimplePER(
                capacity=MEMORY_SIZE,
                nr_agents=self.n_agents,
                engine='torch',
                device=self.dev,
            )
        else:
            raise ValueError("Unknown PER parameter value '{}'".format(PER))

        self.agents = []
        for i in range(n_agents):
            self.agents.append(
                DDPGAgent(a_size=self.action_size,
                          s_size=self.state_size,
                          dev=self.dev,
                          n_agents=self.n_agents,
                          TAU=self.TAU,
                          bn_post=apply_post_bn,
                          actor_layers=actor_layers,
                          actor_input_bn=actor_input_bn,
                          actor_hidden_bn=actor_hidden_bn,
                          critic_state_bn=critic_state_bn,
                          critic_final_bn=critic_final_bn,
                          critic_final_layers=critic_final_layers,
                          critic_state_layers=critic_state_layers,
                          OUnoise=OUNoise,
                          LR_ACTOR=LR_ACTOR,
                          LR_CRITIC=LR_CRITIC,
                          activation=activation,
                          name='{}_Agent_{}'.format(self.__name__, i + 1)))

        self.agents[0].show_architecture()

        self.noise_scaling_factor = noise_scaling_factor
        self.noise_scaling_factor_dec = noise_scaling_factor_dec
        self.n_updates = 0
        self.noise_scaling_min = noise_scaling_min
        return
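A sketch of how this wrapper might select actions with decaying exploration noise; it assumes each DDPGAgent exposes an act(state, noise_scale) method, which is not part of the snippet above.

    def act(self, states):
        # query every agent with its own observation and the shared noise scale
        actions = [agent.act(state, noise_scale=self.noise_scaling_factor)
                   for agent, state in zip(self.agents, states)]
        # decay exploration noise, but never below the configured floor
        self.noise_scaling_factor = max(
            self.noise_scaling_min,
            self.noise_scaling_factor * self.noise_scaling_factor_dec)
        return actions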