Example #1
def Load_Cfagent(defaults):
    with Load(defaults["load_name"], num=defaults['num']) as load:
        collector, env, mover, teleporter, CFagent = load.items(Collector, Game, Mover, Teleporter, CFAgent)
        buffer = ReplayBuffer(**defaults)
        CFbuffer = CFReplayBuffer(**defaults)

        with Save(env, collector, mover, teleporter, CFagent, **defaults) as save:
            intervention_idx, modified_board = teleporter.pre_process(env)
            dones = CFagent.pre_process(env)
            CF_dones, cfs = None, None
            CFagent.CF_count = 0
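            # Main training loop: apply the teleporter and counterfactual interventions to the board,
            # step the environment, store transitions in both replay buffers, and update the mover,
            # teleporter, and counterfactual agent each frame.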
            for frame in loop(env, collector, save, teleporter):
                CFagent.counterfact(env, dones, teleporter, CF_dones, cfs)
                modified_board = teleporter.interveen(env.board, intervention_idx, modified_board)
                actions = mover(modified_board)
                observations, rewards, dones, info = env.step(actions)
                modified_board, modified_rewards, modified_dones, teleport_rewards, intervention_idx = teleporter.modify(observations, rewards, dones, info)
                buffer.teleporter_save_data(teleporter.boards, observations, teleporter.interventions, teleport_rewards, dones, intervention_idx)
                mover.learn(modified_board, actions, modified_rewards, modified_dones)
                board_before, board_after, intervention, tele_rewards, tele_dones = buffer.sample_data()
                teleporter.learn(board_after, intervention, tele_rewards, tele_dones, board_before)
                collector.collect([rewards, modified_rewards, teleport_rewards], [dones, modified_dones])
                CF_dones, cfs = CFagent.counterfact_check(dones, env, **defaults)
                CFbuffer.CF_save_data(CFagent.boards, observations, CFagent.counterfactuals, rewards, dones, CF_dones)
                CFboard, CFobs, cf, CFrewards, CFdones1 = CFbuffer.sample_data()
                CFagent.learn(CFobs, cf, CFrewards, CFdones1, CFboard)
Example #2
    def __init__(self, state_size, action_size, seed, model=QNetwork):
        """Initialize an Agent object.
        
        Param
        =====
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            model (object): model to use
            
        Return
        ======
            None
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = seed

        # Q-Network
        self.qnetwork_local = model(state_size, action_size, seed).to(device)
        self.qnetwork_target = model(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=hyperparameters["lr"])

        # Replay memory
        self.memory = ReplayBuffer(action_size, hyperparameters["buffer_size"],
                                   hyperparameters["batch_size"], seed, device)
        # Initialize time step (for updating every hyperparameters["update_every"] steps)
        self.t_step = 0

        # Init tracking of params
        wandb.login()
        wandb.init(project=project_name, name=name, config=hyperparameters)
        jovian.log_hyperparams(hyperparameters)
Example #3
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.1  # for soft update of target parameters
Example #4
    def __init__(self, state_size, action_size, agent_id):

        self.state_size  = state_size
        self.action_size = action_size
        self.seed        = args['seed']
        self.device      = args['device']
        #self.args        = args

        # Q-Network
        self.actor_network    = ActorNetwork(state_size, action_size).to(self.device)
        self.actor_target     = ActorNetwork(state_size, action_size).to(self.device)
        self.actor_optimizer  = optim.Adam(self.actor_network.parameters(), lr=args['LR_ACTOR'])
        
        # Model takes too long to run --> load model weights from a previous run (took > 24 hours on my machine)
        #if not agent_id:
        #    self.actor_network.load_state_dict(torch.load(args['agent_p0_path']), strict=False)
        #    self.actor_target.load_state_dict(torch.load(args['agent_p0_path']), strict=False)
        #else:
        #    self.actor_network.load_state_dict(torch.load(args['agent_p1_path']), strict=False)
        #    self.actor_target.load_state_dict(torch.load(args['agent_p1_path']), strict=False)
        
        # Replay memory
        self.memory      = ReplayBuffer(action_size, args['BUFFER_SIZE'], args['BATCH_SIZE'], self.device, self.seed)
        
        # Noise process
        self.noise       = OUNoise(action_size, self.seed)
        
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step      = 0
        
        self.mCriticLoss = 0
        
        self.actorLoss   = 0
Example #5
    def get_replay_buffer(self, gamma, env):
        total_score, steps, n = 0, 0, 0
        replay_buffer = ReplayBuffer()
        state = self.state_modifier.apply(env.reset())
        while steps < self.steps:
            self._episodes += 1
            n += 1
            if n == 1:
                print("0 state value {}".format(
                    self.critic.get_values(state).detach()[0]))
            score = 0

            while True:  # timelimits
                if self.render: env.render()
                action = self.actor.get_action(state).detach()
                next_state, reward, done, tl, _ = env.step(action)
                next_state = self.state_modifier.apply(next_state)

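                # On a time-limit truncation (tl == 1), bootstrap the reward with the critic's
                # discounted value estimate of the next state so the cut-off episode is not treated as terminal.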
                if tl == 1:
                    reward += self.critic.get_values(
                        next_state).detach()[0] * gamma

                score += reward
                replay_buffer.append(state, action, reward, done == 1)

                state = next_state
                total_score, steps = total_score + reward, steps + 1
                if done == 1: break

        print("episodes: {}, score: {}, avg steps: {}, avg reward {}".format(
            self._episodes, total_score / n, steps / n, total_score / steps))
        return replay_buffer, total_score / n
Example #6
    def train(self,
              transitions: int,
              sigma_max: float = 1.,
              sigma_min: float = 0.,
              buffer_size: int = 10000,
              batch_size: int = 128,
              progress_upd_step: int = None,
              start_training: int = 1000,
              shaping_coef: float = 300.):
        history = ReplayBuffer(buffer_size)
        progress_upd_step = progress_upd_step if progress_upd_step else transitions // 100

        log = {
            "alpha": self.alpha,
            "gamma": self.gamma,
            "sigma_max": sigma_max,
            "sigma_min": sigma_min,
            "buffer_size": buffer_size,
            "batch_size": batch_size,
            "tau": self.tau,
            "shaping_coef": shaping_coef,
            "step": [],
            "reward_mean": [],
            "reward_std": []
        }

        state = self.reset()
        t = tqdm(range(transitions))
        for i in t:
            sigma = sigma_max - (sigma_max - sigma_min) * i / transitions
            action = self.act(state)
            noise = np.random.normal(scale=sigma, size=action.shape)
            action = np.clip(action + noise, -1, 1)

            next_state, reward, done, _ = self.env.step(action)
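            # Potential-based reward shaping on the velocity term (index 1); done_ flags success
            # when the position (index 0) reaches the 0.5 threshold (a MountainCar-style goal).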
            reward += shaping_coef * (self.gamma * np.abs(next_state[1]) -
                                      np.abs(state[1]))
            done_ = next_state[0] >= 0.5

            history.add((state, action, next_state, reward, done_))

            state = self.reset() if done else next_state

            if i > start_training:
                batch = history.sample(batch_size)
                self.update_critic(batch)
                self.update_actor(batch)

            if (i + 1) % progress_upd_step == 0:
                reward_mean, reward_std = self.evaluate_policy()

                log["step"].append(i)
                log["reward_mean"].append(reward_mean)
                log["reward_std"].append(reward_std)

                t.set_description(
                    f"step: {i + 1} | Rmean = {reward_mean:0.4f} | Rstd = {reward_std:0.4f}"
                )

        return log
Example #7
    def __init__(self, env, sess, low_action_bound_list,
                 high_action_bound_list):
        self.env = env
        self.sess = sess
        self.low_action_bound_list = low_action_bound_list  # depends on the env
        self.high_action_bound_list = high_action_bound_list
        self.action_range_bound = [
            hi - lo for hi, lo in zip(self.high_action_bound_list,
                                      self.low_action_bound_list)
        ]
        self.learning_rate = 0.0001  #TODO move these to configs
        self.epsilon = 1.0
        self.epsilon_min = 0.1
        self.epsilon_decay = 1e-6
        self.gamma = 0.99
        self.tau = 0.001
        self.buffer_size = 1000000
        self.batch_size = 128
        self.theta = 0.15
        self.ou = 0
        self.sigma = 0.3

        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = len(self.low_action_bound_list)  #self.env.action_space, make this into input
        self.continuous_action_space = True

        # Initialize replay buffer
        self.replay_buffer = ReplayBuffer(self.buffer_size)

        # Creating ACTOR model
        actor_ = Actor(self.state_dim, self.action_dim, self.learning_rate)
        self.actor_state_input, self.actor_model = actor_.create_actor_model()
        _, self.target_actor_model = actor_.create_actor_model()

        self.actor_critic_grad = tf.placeholder(tf.float32,
                                                [None, self.action_dim])

        actor_model_weights = self.actor_model.trainable_weights
        self.actor_grads = tf.gradients(self.actor_model.output,
                                        actor_model_weights,
                                        -self.actor_critic_grad)

        grads = zip(self.actor_grads, actor_model_weights)
        self.optimize = tf.train.AdamOptimizer(
            self.learning_rate).apply_gradients(grads)

        # Creating CRITIC model
        critic_ = Critic(self.state_dim, self.action_dim, self.learning_rate)
        self.critic_state_input, self.critic_action_input, self.critic_model = critic_.create_critic_model(
        )
        _, _, self.target_critic_model = critic_.create_critic_model()

        self.critic_grads = tf.gradients(self.critic_model.output,
                                         self.critic_action_input)

        self.noise = OrnsteinUhlenbeckProcess(size=self.action_dim)
        self.noise.reset()

        self.sess.run(tf.global_variables_initializer())
Example #8
def metateleport(defaults):
    collector = Collector(**defaults)
    env = Game(**defaults)
    mover = Mover(env, _extra_dim=1, **defaults)
    teleporter1 = Teleporter(env, _extra_dim=1, **defaults)
    teleporter2 = MetaTeleporter(env, **defaults)
    buffer1 = ReplayBuffer(**defaults)
    buffer2 = ReplayBuffer(**defaults)

    with Save(env, collector, mover, teleporter1, teleporter2, **defaults) as save:
        intervention_idx2, modified_board2 = teleporter2.pre_process(env)
        intervention_idx1, _ = teleporter1.pre_process(env)
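        # Nested-intervention loop: the meta-teleporter modifies the board, the inner teleporter modifies
        # the result, the mover acts on the doubly modified board, and each teleporter learns from its own buffer.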
        for frame in loop(env, collector, save, teleporter1, teleporter2):
            modified_board2 = teleporter2.interveen(env.board, intervention_idx2, modified_board2)
            modified_board1 = teleporter1.interveen(env.board, intervention_idx1, modified_board2)
            actions = mover(modified_board1)
            observations, rewards, dones, info = env.step(actions)
            modified_board1, modified_board2, modified_rewards1, modified_rewards2, modified_dones1, modified_dones2, tele_rewards, intervention_idx1, intervention_idx2 = teleporter2.metamodify(observations, rewards, dones, info, teleporter1.interventions)
            buffer1.teleporter_save_data(teleporter1.boards, modified_board2, teleporter1.interventions, modified_rewards2, modified_dones2, intervention_idx1)
            buffer2.teleporter_save_data(teleporter2.boards, observations, teleporter2.interventions, tele_rewards, dones, intervention_idx2)
            mover.learn(modified_board1, actions, modified_rewards1, modified_dones1)
            board_before, board_after, intervention, tel_rewards, tele_dones = buffer1.sample_data()
            teleporter1.learn(board_after, intervention, tel_rewards, tele_dones, board_before)
            board_before, board_after, intervention, tel_rewards, tele_dones = buffer2.sample_data()
            teleporter2.learn(board_after, intervention, tel_rewards, tele_dones, board_before)
            collector.collect([rewards, modified_rewards1, modified_rewards2, tele_rewards], [dones, modified_dones1, modified_dones2])
Example #9
    def train(self,
              transitions: int,
              eps_max: float = 0.5,
              eps_min: float = 0.,
              buffer_size: int = 10000,
              batch_size: int = 128,
              shaping_coef: float = 300.,
              progress_upd_step: int = None,
              start_training: int = 10000):
        history = ReplayBuffer(size=buffer_size)
        progress_upd_step = progress_upd_step if progress_upd_step else transitions // 100

        log = {
            "alpha": self.alpha,
            "gamma": self.gamma,
            "buffer_size": buffer_size,
            "batch_size": batch_size,
            "tau": self.tau,
            "shaping_coef": shaping_coef,
            "eps_max": eps_max,
            "eps_min": eps_min,
            "step": [],
            "reward_mean": [],
            "reward_std": []
        }

        state = self.reset()

        t = tqdm(range(transitions))
        for i in t:
            eps = eps_max - (eps_max - eps_min) * i / transitions
            if random() < eps:
                action = self.env.action_space.sample()
            else:
                action = self.act(state)

            next_state, reward, done, _ = self.env.step(action)
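            # Velocity-based reward shaping (index 1); done_ flags reaching the 0.5 position threshold
            # (a MountainCar-style goal).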
            reward += shaping_coef * (self.gamma * np.abs(next_state[1]) -
                                      np.abs(state[1]))
            done_ = next_state[0] >= 0.5

            history.add((state, action, next_state, reward, done_))

            state = self.reset() if done else next_state

            if i > start_training:
                self.update(history.sample(batch_size))

            if (i + 1) % progress_upd_step == 0:
                reward_mean, reward_std = self.evaluate_policy()

                log["step"].append(i)
                log["reward_mean"].append(reward_mean)
                log["reward_std"].append(reward_std)

                t.set_description(
                    f"step: {i + 1} | Rmean = {reward_mean:0.4f} | Rstd = {reward_std:0.4f}"
                )

        return log
Example #10
def CFagent(defaults):
    env = Game(**defaults)
    mover = Mover(env, _extra_dim=1, **defaults)
    teleporter = Teleporter(env, **defaults)
    buffer = ReplayBuffer(**defaults)
    CFagent = CFAgent(env, **defaults)
    CFbuffer = CFReplayBuffer(**defaults)
    collector = Collector(**defaults)

    with Save(env, collector, mover, teleporter, CFagent, **defaults) as save:
        intervention_idx, modified_board = teleporter.pre_process(env)
        dones = CFagent.pre_process(env)
        CF_dones, cfs = None, None
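        # Training loop: apply counterfactual and teleporter interventions, step the environment,
        # store transitions, and update the mover, teleporter, and counterfactual agent.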
        for frame in loop(env, collector, save, teleporter):
            CFagent.counterfact(env, dones, teleporter, CF_dones, cfs)
            modified_board = teleporter.interveen(env.board, intervention_idx, modified_board)
            actions = mover(modified_board)
            observations, rewards, dones, info = env.step(actions)
            modified_board, modified_rewards, modified_dones, teleport_rewards, intervention_idx = teleporter.modify(observations, rewards, dones, info)
            buffer.teleporter_save_data(teleporter.boards, observations, teleporter.interventions, teleport_rewards, dones, intervention_idx)
            mover.learn(modified_board, actions, modified_rewards, modified_dones)
            board_before, board_after, intervention, tele_rewards, tele_dones = buffer.sample_data()
            teleporter.learn(board_after, intervention, tele_rewards, tele_dones, board_before)
            collector.collect([rewards, modified_rewards, teleport_rewards], [dones, modified_dones])
            CF_dones, cfs = CFagent.counterfact_check(dones, env, **defaults)
            CFbuffer.CF_save_data(CFagent.boards, observations, CFagent.counterfactuals, rewards, dones, CF_dones)
            CFboard, CFobs, cf, CFrewards, CFdones1 = CFbuffer.sample_data()
            CFagent.learn(CFobs, cf, CFrewards, CFdones1, CFboard)
Example #11
    def __init__(self,
                 name,
                 args,
                 sess=None,
                 reuse=False,
                 log_tensorboard=True,
                 save=True):
        self.learn_steps = 0

        # hyperparameters
        self.gamma = args[name]['gamma']
        self.tau = args[name]['tau']
        self.init_noise_sigma = args[name]['init_noise_sigma']
        self.noise_decay = args[name]['noise_decay']

        # replay buffer
        self.buffer = ReplayBuffer(sample_size=args['batch_size'],
                                   max_len=args[name]['buffer_size'])

        super(DDPG, self).__init__(name,
                                   args,
                                   sess=sess,
                                   reuse=reuse,
                                   build_graph=True,
                                   log_tensorboard=log_tensorboard,
                                   save=save)

        self._initialize_target_net()
Example #12
    def __init__(self, state_size, action_size, fc1_units=256, fc2_units=128, device=torch.device('cpu')):
        """DQN agent

        Args:
          state_size (int): dimension of each state
          action_size (int): dimension of each action (or the number of action choices)
          fc1_units (int): number of units in the first hidden layer
          fc2_units (int): number of units in the second hidden layer
          device (torch.device): device on which to place the networks
        """
        self.state_size = state_size
        self.action_size = action_size
        self.device = device

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       fc1_units=fc1_units, fc2_units=fc2_units).to(self.device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        fc1_units=fc1_units, fc2_units=fc2_units).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Initialize qnetwork_target parameters to match qnetwork_local
        self.soft_update(self.qnetwork_local, self.qnetwork_target, 1)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, device=self.device)

        # Initialize the time step counter (for updating every UPDATE_EVERY steps)
        self.t_step = 0
Example #13
    def __init__(self, env):

        # Hyperparameters
        self.GAMMA = 0.95
        self.BATCH_SIZE = 64
        self.BUFFER_SIZE = 20000
        self.ACTOR_LEARNING_RATE = 0.0001
        self.CRITIC_LEARNING_RATE = 0.001
        self.TAU = 0.001

        self.env = env
        # get state dimension
        self.state_dim = env.observation_space.shape[0]
        # get action dimension
        self.action_dim = env.action_space.shape[0]
        # get action bound
        self.action_bound = env.action_space.high[0]

        ## create actor and critic networks
        self.actor = Actor(self.state_dim, self.action_dim, self.action_bound,
                           self.TAU, self.ACTOR_LEARNING_RATE)
        self.critic = Critic(self.state_dim, self.action_dim, self.TAU,
                             self.CRITIC_LEARNING_RATE)

        ## initialize replay buffer
        self.buffer = ReplayBuffer(self.BUFFER_SIZE)

        # save the results
        self.save_epi_reward = []
Example #14
    def __init__(self,
                 DQNType,
                 input_shape,
                 replaybuffersize=100000,
                 input_preprocess=[]):
        super().__init__(MOVEMENTS.COMPLEX)
        self.memory = ReplayBuffer(replaybuffersize)
        self.train_network = DQNType(input_shape, len(self.movements))
        self.target_network = self.train_network.clone_model()
        self.input_preprocess = input_preprocess

        ## Initialize
        self.counter = 0
        self.epsilon = 1

        ## hyperparameters
        self.hyperparams = {
            "burn_in": 10000,
            "copy_each": 5000,
            "learn_each": 1,
            "save_each": 5000,
            "final_epsilon": 0.1,
            "epsilon_decay_rate": 0.99998,
            "batch_size": 32,
            "gamma": 0.99
        }
Example #15
    def __init__(self,
                 state_size,
                 action_size,
                 device,
                 buffer_size=int(1e5),
                 batch_size=64,
                 gamma=0.99,
                 tau=1e-3,
                 lr=5e-4,
                 update_every=4):
        self.state_size = state_size
        self.action_size = action_size
        self.device = device
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        self.update_every = update_every

        # model settings
        self.qnet_local = Model(state_size, action_size).to(self.device)
        self.qnet_target = Model(state_size, action_size).to(self.device)
        self.optimizer = optim.Adam(self.qnet_local.parameters(), lr=self.lr)

        # replay buffer settings
        self.replay_buffer = ReplayBuffer(self.buffer_size, self.batch_size)
        self.update_step = 0
Example #16
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 hidden_layers=[64, 64],
                 buffer_size=int(1e5),
                 batch_size=64,
                 gamma=0.99,
                 tau=1e-3,
                 learning_rate=5e-4,
                 update_every=4,
                 head_name="DuelingDQN",
                 head_scale="max"):
        """Initialize an Agent object.
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            hidden_layers (list of int; optional): number of nodes in each hidden layer
            buffer_size (int ; optional): replay buffer size
            batch_size (int; optional): minibatch size
            gamma (float; optional): discount factor
            tau (float; optional): for soft update of target parameters
            learning_rate (float; optional): learning rate
            update_every (int; optional): how often to update the network
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = learning_rate
        self.update_every = update_every

        # detect GPU device
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        # Assign model parameters and assign device
        model_params = [
            state_size, action_size, seed, hidden_layers, head_name, head_scale
        ]
        self.qnetwork_local = QNetwork(*model_params).to(self.device)
        self.qnetwork_target = QNetwork(*model_params).to(self.device)

        # Set up optimizer
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=self.lr)

        # Initialize Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size, seed, self.device)
        # Initialize time step (for updating every self.update_every steps)
        self.t_step = 0
Example #17
    def __init__(self, device, state_size, n_agents, action_size, random_seed, \
                         buffer_size, batch_size, gamma, TAU, lr_actor, lr_critic, weight_decay,  \
                         learn_interval, learn_num, ou_sigma, ou_theta, checkpoint_folder = './'):

        # Set Computational device
        self.DEVICE = device

        # Init State, action and agent dimensions
        self.state_size = state_size
        self.n_agents = n_agents
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.l_step = 0
        self.log_interval = 200

        # Init Hyperparameters
        self.BUFFER_SIZE = buffer_size
        self.BATCH_SIZE = batch_size
        self.GAMMA = gamma
        self.TAU = TAU
        self.LR_ACTOR = lr_actor
        self.LR_CRITIC = lr_critic
        self.WEIGHT_DECAY = weight_decay
        self.LEARN_INTERVAL = learn_interval
        self.LEARN_NUM = learn_num

        # Init Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Init Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=weight_decay)

        # Init Noise Process
        self.noise = OUNoise((n_agents, action_size),
                             random_seed,
                             mu=0.,
                             theta=ou_theta,
                             sigma=ou_sigma)

        # Init Replay Memory
        self.memory = ReplayBuffer(device, action_size, buffer_size,
                                   batch_size, random_seed)
Example #18
    def __init__(self,
                 num_agents,
                 local_obs_dim,
                 local_action_size,
                 global_obs_dim,
                 global_action_size,
                 discount_factor=0.95,
                 tau=0.02,
                 device=device,
                 random_seed=4,
                 lr_critic=1.0e-4,
                 weight_decay=0.0):
        super(MADDPG, self).__init__()

        # parameter configuration
        self.num_agents = num_agents
        self.device = device
        self.discount_factor = discount_factor
        self.tau = tau
        self.global_action_size = global_action_size
        self.global_obs_dim = global_obs_dim
        torch.manual_seed(random_seed)
        random.seed(random_seed)
        self.random_seed = random_seed
        self.weight_decay = weight_decay

        # define actors
        self.actors = [
            DDPGActor(num_agents,
                      local_obs_dim,
                      local_action_size,
                      global_obs_dim,
                      global_action_size,
                      device=device) for _ in range(num_agents)
        ]
        # define centralized critic
        self.critic = Critic(global_obs_dim, global_action_size,
                             self.random_seed).to(self.device)
        self.target_critic = Critic(global_obs_dim, global_action_size,
                                    self.random_seed).to(self.device)
        hard_update(self.target_critic, self.critic)

        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=lr_critic,
                                     weight_decay=self.weight_decay)

        # noise coef
        self.noise_coef = 1.0
        self.noise_coef_decay = 1e-6

        # Replay memory
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, random_seed)
Example #19
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 random_seed=0,
                 params=params):
        """
        Initialize an Agent object.
        Params
        ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        num_agents (int): number of agents
        random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)
        self.params = params

        # Actor (Policy) Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(self.params['DEVICE'])
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(self.params['DEVICE'])
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.params['LR_ACTOR'])

        # Critic (Value) Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(self.params['DEVICE'])
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(self.params['DEVICE'])
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=self.params['LR_CRITIC'],
            weight_decay=self.params['WEIGHT_DECAY'])

        # Initialize target and local to same weights
        self.hard_update(self.actor_local, self.actor_target)
        self.hard_update(self.critic_local, self.critic_target)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.params['BUFFER_SIZE'],
                                   self.params['BATCH_SIZE'], random_seed)
Example #20
def train_network(config: MuZeroConfig, storage: SharedStorage, replay_buffer: ReplayBuffer):
    
    network = storage.latest_network() # recover the latest network to be updated
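    # Exponentially decay the learning rate with the number of training steps the network has already taken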
    learning_rate = config.lr_init * config.lr_decay_rate**(network.training_steps()/config.lr_decay_steps)
    network.optimiser.learning_rate = learning_rate
    
    for i in range(config.training_steps+1):
        
        if i % config.checkpoint_interval == 0:
            storage.save_network(network.training_steps(), network)

        batch = replay_buffer.sample_batch(config.num_unroll_steps, config.td_steps, config.prediction_interval) 

        l = network.update_weights(batch, config.weight_decay, config.hidden_state_dampen)

        if i % 100 == 0:
            print((i, l))
            
    storage.save_network(network.training_steps(), network)
    
    return i
Example #21
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 hidden_layers=[64, 64],
                 drop_p=0.3,
                 with_dueling=False,
                 isDDQN=False):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            hidden_layers (list of int): number of nodes in each hidden layer
            drop_p (float in [0, 1]): probability of dropping nodes (dropout)
            with_dueling (bool): if True, use a dueling network architecture
            isDDQN (bool): if True, use double DQN
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size,
                                       action_size,
                                       seed,
                                       hidden_layers=hidden_layers,
                                       drop_p=drop_p,
                                       dueling=with_dueling).to(device)
        self.qnetwork_target = QNetwork(state_size,
                                        action_size,
                                        seed,
                                        hidden_layers=hidden_layers,
                                        drop_p=drop_p,
                                        dueling=with_dueling).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        # Whether double DQN is enabled.
        self.isDDQN = isDDQN
Example #22
    def __init__(self, env, gamma, batch_size, buffer_size, lr_rate, tau):

        self.env = env
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.action_bound = env.action_space.high[0]

        self.gamma = gamma
        self.batch_size = batch_size
        self.buffer_size = buffer_size

        self.actor = Actor(self.state_dim, self.action_dim, self.action_bound,
                           lr_rate[0], tau)
        self.critic = Critic(self.state_dim, self.action_dim, lr_rate[1], tau)

        self.buffer = ReplayBuffer(self.buffer_size)
        self.save_epi_reward = []
Example #23
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 GAMMA=GAMMA,
                 TAU=TAU,
                 LR=LR,
                 UPDATE_EVERY=UPDATE_EVERY,
                 BUFFER_SIZE=BUFFER_SIZE,
                 BATCH_SIZE=BATCH_SIZE):
        """ Initialize the agent.
        ==========
        PARAMETERS 
        ==========
            state_size (int) = observation dimension of the environment
            action_size (int) = dimension of each action
            seed (int) = random seed
        """

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        self.gamma = GAMMA
        self.tau = TAU
        self.lr = LR
        self.update_every = UPDATE_EVERY
        self.buffer_size = BUFFER_SIZE
        self.batch_size = BATCH_SIZE

        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        # instantiate online local and target network for weight updates
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(self.device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=self.lr)
        # create a replay buffer
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size, seed, self.device)
        # time steps for updating target network every time t_step % 4 == 0
        self.t_step = 0
Example #24
    def __init__(self,
                 env=gym.make('Pendulum-v0'),
                 s_dim=2,
                 a_dim=1,
                 gamma=0.99,
                 episodes=100,
                 tau=0.001,
                 buffer_size=1e06,
                 minibatch_size=64,
                 actor_lr=0.001,
                 critic_lr=0.001,
                 save_name='final_weights',
                 render=False):
        self.save_name = save_name
        self.render = render
        self.env = env
        self.upper_bound = env.action_space.high[0]
        self.lower_bound = env.action_space.low[0]
        self.EPISODES = episodes
        self.MAX_TIME_STEPS = 200
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.GAMMA = gamma
        self.TAU = tau
        self.buffer_size = buffer_size
        self.minibatch_size = minibatch_size
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr

        self.ou_noise = OUNoise(mean=np.zeros(1))

        self.actor = Actor(self.s_dim, self.a_dim).model()
        self.target_actor = Actor(self.s_dim, self.a_dim).model()
        self.actor_opt = tf.keras.optimizers.Adam(learning_rate=self.actor_lr)
        self.target_actor.set_weights(self.actor.get_weights())

        self.critic = Critic(self.s_dim, self.a_dim).model()
        self.critic_opt = tf.keras.optimizers.Adam(
            learning_rate=self.critic_lr)
        self.target_critic = Critic(self.s_dim, self.a_dim).model()
        self.target_critic.set_weights(self.critic.get_weights())

        self.replay_buffer = ReplayBuffer(self.buffer_size)
Example #25
    def __init__(self, env, state_dim, action_dim):
        self.name = 'DDPG'
        self.env = env
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.AE = Actor(state_dim,action_dim).cuda()
        self.CE = Critic(state_dim,action_dim).cuda()
        self.AT = Actor(state_dim,action_dim).cuda()
        self.CT = Critic(state_dim,action_dim).cuda()
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        self.time_step = 0

        self.AE.load_state_dict(torch.load(MODEL_DIR+'/obs/actor_340000.pkl'))
        # self.AT.load_state_dict(torch.load(MODEL_DIR+'/actor_280000.pkl'))
        # self.CE.load_state_dict(torch.load(MODEL_DIR+'/critic_280000.pkl'))
        # self.CT.load_state_dict(torch.load(MODEL_DIR+'/critic_280000.pkl'))

        self.optimizer_a = torch.optim.Adam(self.AE.parameters(), lr=1e-4)
        self.optimizer_c = torch.optim.Adam(self.CE.parameters(), lr=1e-4)
Example #26
    def __init__(self, env, batch_size, mem_size, discount, actor_params,
                 critic_params):
        self._batch_size = batch_size
        self._mem_size = mem_size
        self._discount = discount
        self._sess = tensorflow.Session()
        k_backend.set_session(self._sess)
        self._env = env
        self._state_dim = env.observation_space.shape[0]
        self._action_dim = env.action_space.shape[0]
        self._action_min = env.action_space.low
        self._action_max = env.action_space.high
        self._state_min = env.observation_space.low
        self._state_max = env.observation_space.high
        self._actor = Actor(self._sess, self._state_dim, self._action_dim,
                            self._action_min, self._action_max, actor_params)
        self._critic = Critic(self._sess, 0.5, self._state_dim,
                              self._action_dim, critic_params)
        self._memory = ReplayBuffer(mem_size)
Example #27
    def __init__(self, state_size, action_size, random_seed):
        """ Initialize the agent with arguments as follows:

            ARGUMENTS
            =========
                - state_size (int) = dimension of input space
                - action_size (int) = dimension of action space
                - random_seed (int) = random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # create noise
        self.noise = OUNoise(action_size, random_seed)
        self.noise_decay = NOISE_DECAY

        # create memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed, device)

        # Actor Networks (local online net + target net)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Networks (local online net + target net)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # instantiate online and target networks with same weights
        self.soft_update(self.actor_local, self.actor_target, 1)
        self.soft_update(self.critic_local, self.critic_target, 1)

        self.learn_counter = 0
Example #28
def teleport(defaults):
    collector = Collector(**defaults)
    env = Game(**defaults)
    mover = Mover(env, _extra_dim=1, **defaults)
    teleporter = Teleporter(env, **defaults)
    buffer = ReplayBuffer(**defaults)

    with Save(env, collector, mover, teleporter, **defaults) as save:
        intervention_idx, modified_board = teleporter.pre_process(env)
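        # Training loop: intervene on the board, step the environment, store teleporter transitions,
        # and update both the mover and the teleporter from sampled replay data.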
        for frame in loop(env, collector, save, teleporter):
            modified_board = teleporter.interveen(env.board, intervention_idx, modified_board)
            actions = mover(modified_board)
            observations, rewards, dones, info = env.step(actions)
            modified_board, modified_rewards, modified_dones, teleport_rewards, intervention_idx = teleporter.modify(observations, rewards, dones, info)
            buffer.teleporter_save_data(teleporter.boards, observations, teleporter.interventions, teleport_rewards, dones, intervention_idx)
            mover.learn(modified_board, actions, modified_rewards, modified_dones)
            board_before, board_after, intervention, tele_rewards, tele_dones = buffer.sample_data()
            teleporter.learn(board_after, intervention, tele_rewards, tele_dones, board_before)
            collector.collect([rewards, modified_rewards, teleport_rewards], [dones, modified_dones])
Example #29
    def __init__(self, action_shape, model_structure, agent_hyperparams,
                 dueling, double):
        self.device = torch.device(agent_hyperparams["device"])
        self.action_shape = action_shape
        self.dueling = dueling
        self.double = double
        if self.dueling:
            prime = model_structure[0]
            value = model_structure[1]
            advantage = model_structure[2]
            self.local_model = DuelingQ(prime, value,
                                        advantage).to(self.device)
            self.target_model = DuelingQ(prime, value,
                                         advantage).to(self.device)
            self.target_model.load_state_dict(self.local_model.state_dict())
        else:
            self.local_model = Model(model_structure).to(self.device)
            self.target_model = Model(model_structure).to(self.device)
            self.target_model.load_state_dict(self.local_model.state_dict())

        self.optimizer = optim.RMSprop(self.local_model.parameters(),
                                       lr=agent_hyperparams['lr'])

        self.replay_buffer = ReplayBuffer(
            agent_hyperparams['memory_size'], agent_hyperparams['batch_size'],
            agent_hyperparams['greedy_coeff'],
            agent_hyperparams['default_priority'],
            agent_hyperparams['shed_amount'])

        self.eps = agent_hyperparams['eps']
        self.alpha = agent_hyperparams['alpha']
        self.gamma = agent_hyperparams['gamma']
        self.beta = agent_hyperparams['beta']

        self.eps_decay = agent_hyperparams['eps_decay']
        self.alpha_decay = agent_hyperparams['alpha_decay']
        self.gamma_decay = agent_hyperparams['gamma_decay']
        self.beta_decay = agent_hyperparams['beta_decay']

        self.min_eps = agent_hyperparams['min_eps']
        self.min_alpha = agent_hyperparams['min_alpha']
        self.min_gamma = agent_hyperparams['min_gamma']
        self.min_beta = agent_hyperparams['min_beta']
Example #30
class Agent:
    def __init__(self,
                 input_dim,
                 output_dim,
                 tau=0.001,
                 gamma=0.99,
                 train_batch_size=640):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.tau = tau
        self.gamma = gamma
        self.train_batch_size = train_batch_size
        self.main_critic = Critic(input_dim, output_dim, tau, gamma)
        self.target_critic = Critic(input_dim, output_dim, tau, gamma)

        self.main_actor = Actor(input_dim, output_dim, tau, gamma)
        self.target_actor = Actor(input_dim, output_dim, tau, gamma)

        self.target_critic.model.set_weights(
            self.main_critic.model.get_weights())
        self.target_actor.model.set_weights(
            self.main_actor.model.get_weights())

        self.memory = ReplayBuffer(batch_size=train_batch_size)

    def get_action(self, state):
        return self.main_actor.get_action(state)

    def train(self):
        data = self.memory.sample()
        states = np.vstack([e.state for e in data if e is not None])
        actions = np.array([e.action for e in data if e is not None
                            ]).astype(np.float32).reshape(-1, self.output_dim)
        rewards = np.array([e.reward for e in data if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in data
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in data if e is not None])

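        # Compute bootstrapped Q targets from the target actor and critic, train the critic on them,
        # then update the actor along the critic's action gradients and soft-update both target networks.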
        actions_next = self.target_actor.model.predict_on_batch(next_states)
        Q_targets_next = self.target_critic.model.predict_on_batch(
            [next_states, actions_next])

        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)

        self.main_critic.train(states, actions, Q_targets)
        action_gradients = np.reshape(self.main_critic.get_gradient(states,actions), \
                                         (-1, self.output_dim))

        self.main_actor.train(states, action_gradients)

        self.target_actor.model = self.main_actor.soft_update(
            self.target_actor.model)
        self.target_critic.model = self.main_critic.soft_update(
            self.target_critic.model)