Example #1
    def __init__(self,
                 action_size=2,
                 seed=0,
                 load_file=None,
                 n_agents=2,
                 buffer_size=int(3e4),
                 batch_size=128,
                 gamma=0.99,
                 update_every=2,
                 noise_start=1.0,
                 noise_decay=1.0,
                 evaluation_only=False):
        """
        Params
        ======
            action_size (int): dimension of each action
            seed (int): Random seed
            load_file (str): path of checkpoint file to load
            n_agents (int): number of distinct agents
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            update_every (int): how often to update the network
            noise_start (float): initial noise weighting factor
            noise_decay (float): noise decay rate
            evaluation_only (bool): set to True to disable updating gradients and adding noise
        """

        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.update_every = update_every
        self.gamma = gamma
        self.n_agents = n_agents
        self.noise_weight = noise_start
        self.noise_decay = noise_decay
        self.t_step = 0
        self.evaluation_only = evaluation_only
        # create one DDPG agent per player, each with its own actor and critic
        models = [model.LowDim2x(n_agents=n_agents) for _ in range(n_agents)]
        self.agents = [
            DDPG(i, models[i], load_file=None) for i in range(n_agents)
        ]
        # create shared replay buffer
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size, seed)
        if load_file:
            for i, save_agent in enumerate(self.agents):
                actor_file = torch.load(load_file + '.' + str(i) +
                                        '.actor.pth',
                                        map_location='cpu')
                critic_file = torch.load(load_file + '.' + str(i) +
                                         '.critic.pth',
                                         map_location='cpu')
                save_agent.actor_local.load_state_dict(actor_file)
                save_agent.actor_target.load_state_dict(actor_file)
                save_agent.critic_local.load_state_dict(critic_file)
                save_agent.critic_target.load_state_dict(critic_file)
            print('Loaded: {}.*.actor.pth'.format(load_file))
            print('Loaded: {}.*.critic.pth'.format(load_file))
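The loading branch above expects one checkpoint pair per agent, named '<load_file>.<i>.actor.pth' and '<load_file>.<i>.critic.pth'. A minimal sketch of the matching save step under that assumption is shown below; the helper name and its call site are not part of the source:

import torch

def save_checkpoints(agents, save_file):
    # Assumed helper: writes one actor/critic pair per agent using the same
    # '<save_file>.<i>.actor.pth' / '<save_file>.<i>.critic.pth' names that
    # __init__ reads back when load_file is passed.
    for i, agent in enumerate(agents):
        torch.save(agent.actor_local.state_dict(), '{}.{}.actor.pth'.format(save_file, i))
        torch.save(agent.critic_local.state_dict(), '{}.{}.critic.pth'.format(save_file, i))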
Example #2
    def __init__(self,
                 model,
                 action_size,
                 seed=0,
                 load_file=None,
                 buffer_size=int(1e5),
                 batch_size=64,
                 gamma=0.99,
                 tau=1e-3,
                 lr=5e-4,
                 update_every=4,
                 use_double_dqn=True,
                 use_prioritized_experience_replay=False,
                 alpha_start=0.5,
                 alpha_decay=0.9992,
                 action_map=None):
        """
        Params
        ======
            model: model object
            action_size (int): dimension of each action
            seed (int): Random seed
            load_file (str): path of checkpoint file to load
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            tau (float): for soft update of target parameters
            lr (float): learning rate
            update_every (int): how often to update the network
            use_double_dqn (bool): whether to use the double DQN algorithm
            use_prioritized_experience_replay (bool): whether to use prioritized experience replay (PER)
            alpha_start (float): initial value for alpha, used in PER
            alpha_decay (float): decay rate for alpha, used in PER
            action_map (dict): how to map action indexes from model output to gym environment
        """
        random.seed(seed)

        self.action_size = action_size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        self.update_every = update_every
        self.use_double_dqn = use_double_dqn
        self.use_prioritized_experience_replay = use_prioritized_experience_replay

        self.loss_list = []  # track loss across steps
        self.entropy_list = []  # track entropy across steps

        # Q-Network
        self.qnetwork_local = model.local
        self.qnetwork_target = model.target

        # DEBUG weight initialization
        #print(self.qnetwork_local.fc_s.weight.data[0])
        #print(self.qnetwork_target.fc_s.weight.data[0])
        #self.qnetwork_local.fc_s.weight.data[0] = torch.tensor([0.0, 0.0, 0.0, 0.0])
        #print(self.qnetwork_local.fc_s.weight.data[0])
        #print(self.qnetwork_target.fc_s.weight.data[0])
        #input('->')

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)
        #self.optimizer = optim.RMSprop(self.qnetwork_local.parameters(), lr=.00025, momentum=0.95)

        # Replay memory
        if use_prioritized_experience_replay:
            self.memory = PrioritizedReplayBuffer(action_size,
                                                  self.buffer_size,
                                                  self.batch_size, seed)
        else:
            self.memory = ReplayBuffer(action_size, self.buffer_size,
                                       self.batch_size, seed)
        # Initialize time step (for updating every update_every steps)
        self.t_step = 0
        # initialize alpha (used in the prioritized experience sampling probability)
        self.alpha_start = alpha_start
        self.alpha_decay = alpha_decay
        self.alpha = self.alpha_start

        if load_file:
            self.qnetwork_local.load_state_dict(torch.load(load_file + '.pth'))
            self.qnetwork_target.load_state_dict(torch.load(load_file +
                                                            '.pth'))
            #self.memory = dill.load(open(load_file + '.buffer.pck','rb'))
            print('Loaded: {}'.format(load_file))

        self.action_map = action_map
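The docstring above describes tau as the factor for the soft update of the target network, but the update itself is not shown in this snippet. Below is a minimal sketch of the usual soft update, assuming the standard DQN convention; the helper is illustrative, not the project's own method:

import torch

def soft_update(local_model, target_model, tau):
    # theta_target <- tau * theta_local + (1 - tau) * theta_target
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(tau * local_param.data +
                                (1.0 - tau) * target_param.data)

Under that assumption it would be called after each learning step, e.g. soft_update(self.qnetwork_local, self.qnetwork_target, self.tau).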
Example #3
    def __init__(self,
                 model,
                 action_size,
                 seed=0,
                 load_file=None,
                 n_agents=1,
                 buffer_size=int(1e5),
                 batch_size=64,
                 gamma=0.99,
                 tau=1e-3,
                 lr_actor=1e-4,
                 lr_critic=1e-3,
                 weight_decay=0.0001,
                 clip_gradients=False,
                 theta=0.15,
                 sigma=0.2,
                 update_every=1,
                 use_prioritized_experience_replay=False,
                 alpha_start=0.5,
                 alpha_decay=0.9992,
                 evaluation_only=False):
        """
        Params
        ======
            model: model object
            action_size (int): dimension of each action
            seed (int): Random seed
            load_file (str): path of checkpoint file to load
            n_agents (int): number of agents to train simultaneously
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            tau (float): for soft update of target parameters
            lr_actor (float): learning rate for actor
            lr_critic (float): learning rate for critic
            weight_decay (float): L2 weight decay
            clip_gradients (bool): whether to clip gradients on both actor and critic
            theta (float): OU noise parameter
            sigma (float): OU noise parameter
            update_every (int): how often to update the network
            use_prioritized_experience_replay (bool): whether to use prioritized experience replay (PER)
            alpha_start (float): initial value for alpha, used in PER
            alpha_decay (float): decay rate for alpha, used in PER
            evaluation_only (bool): set to True to disable updating gradients and adding noise
        """
        random.seed(seed)

        self.action_size = action_size
        self.n_agents = n_agents
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.update_every = update_every
        self.use_prioritized_experience_replay = use_prioritized_experience_replay
        self.clip_gradients = clip_gradients
        self.evaluation_only = evaluation_only

        self.loss_list = []  # track loss across steps

        # Actor Network
        self.actor_local = model.actor_local
        self.actor_target = model.actor_target
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)
        # Critic Network
        self.critic_local = model.critic_local
        self.critic_target = model.critic_target
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=weight_decay)

        # DEBUG weight initialization
        #print(self.actor_local.fcs1.weight.data[0])
        #print(self.actor_target.fcs1.weight.data[0])
        #print(self.critic_local.fcs1.weight.data[0])
        #print(self.critic_target.fcs1.weight.data[0])
        #input('->')

        # Noise process
        self.noise = OUNoise((n_agents, action_size),
                             seed,
                             theta=theta,
                             sigma=sigma)

        # Replay memory
        if use_prioritized_experience_replay:
            self.memory = PrioritizedReplayBuffer(action_size,
                                                  self.buffer_size,
                                                  self.batch_size, seed)
        else:
            self.memory = ReplayBuffer(action_size, self.buffer_size,
                                       self.batch_size, seed)
        # Initialize time step (for updating every update_every steps)
        self.t_step = 0
        # initialize alpha (used in the prioritized experience sampling probability)
        self.alpha_start = alpha_start
        self.alpha_decay = alpha_decay
        self.alpha = self.alpha_start

        if load_file:
            if device.type == 'cpu':
                self.actor_local.load_state_dict(
                    torch.load(load_file + '.actor.pth', map_location='cpu'))
                self.actor_target.load_state_dict(
                    torch.load(load_file + '.actor.pth', map_location='cpu'))
                self.critic_local.load_state_dict(
                    torch.load(load_file + '.critic.pth', map_location='cpu'))
                self.critic_target.load_state_dict(
                    torch.load(load_file + '.critic.pth', map_location='cpu'))
                #self.memory = dill.load(open(load_file + '.buffer.pck','rb'))
            elif device.type == 'cuda':  # torch reports device.type as 'cuda', not 'cuda:0'
                self.actor_local.load_state_dict(
                    torch.load(load_file + '.actor.pth'))
                self.actor_target.load_state_dict(
                    torch.load(load_file + '.actor.pth'))
                self.critic_local.load_state_dict(
                    torch.load(load_file + '.critic.pth'))
                self.critic_target.load_state_dict(
                    torch.load(load_file + '.critic.pth'))
                #self.memory = dill.load(open(load_file + '.buffer.pck','rb'))
            print('Loaded: {}'.format(load_file))
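theta and sigma above are passed straight to OUNoise, whose implementation is not included here. The sketch below is a generic Ornstein-Uhlenbeck process with the same parameters; the class name and internals are assumptions, not the project's code:

import numpy as np

class OUNoiseSketch:
    """Mean-reverting noise: drifts toward mu at rate theta, perturbed by Gaussian noise scaled by sigma."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.default_rng(seed)
        self.reset()

    def reset(self):
        # return the internal state to the long-run mean
        self.state = self.mu.copy()

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1)
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * self.rng.standard_normal(self.mu.shape)
        self.state = self.state + dx
        return self.state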