Code example #1
class DDPG(object):
    def __init__(self, nb_states, nb_actions, args):
        self.cuda = args.cuda

        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Initialize the actor/critic networks and their target copies

        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w
        }
        self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg)

        self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg)

        self.criterion = nn.MSELoss()
        if self.cuda:
            self.actor = self.actor.cuda()  # TODO: DataParallel not working; move each model to the GPU directly
            self.critic = self.critic.cuda()
            self.actor_target = self.actor_target.cuda()
            self.critic_target = self.critic_target.cuda()
            self.criterion = self.criterion.cuda()

        # Set optimizer
        self.actor_optim = torch.optim.Adam(self.actor.parameters(),
                                            lr=args.prate)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(),
                                             lr=args.rate)
        # Loss function (sum-reduced MSE); `size_average` is deprecated in favour of `reduction`
        self.loss_fn = torch.nn.MSELoss(reduction='sum')

        hard_update(self.actor_target, self.actor)  # make sure the target starts with the same weights
        hard_update(self.critic_target, self.critic)

        self.memory = SequentialMemory(limit=args.rmsize,
                                       window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

    def update_policy(self):
        # Sample batch
        state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = self.memory.sample_and_split(
            self.batch_size)

        # Prepare the target Q batch; no gradients flow through the target networks
        with torch.no_grad():
            next_q_values = self.critic_target([
                to_tensor(next_state_batch),
                self.actor_target(to_tensor(next_state_batch)),
            ])

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor(terminal_batch.astype(np.float32)) * next_q_values

        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic(
            [to_tensor(state_batch),
             to_tensor(action_batch)])

        value_loss = self.criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()

        policy_loss = -self.critic(
            [to_tensor(state_batch),
             self.actor(to_tensor(state_batch))])

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        return action

    def observe(self, r_t, s_t1, done):
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def select_action(self, s_t, decay_epsilon=True):
        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        action += self.is_training * max(self.epsilon,
                                         0) * self.random_process.sample()
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_states()

    def load_wts(self, modelfile):
        if os.path.isfile(modelfile + 'model.pth.tar'):
            checkpoint = torch.load(modelfile + 'model.pth.tar')
            self.actor.load_state_dict(checkpoint['actor_state_dict'])
            self.critic.load_state_dict(checkpoint['critic_state_dict'])
            self.actor_optim.load_state_dict(checkpoint['actor_optim'])
            self.critic_optim.load_state_dict(checkpoint['critic_optim'])
            return checkpoint['step']
        else:
            return 0

    def save_wts(self, savefile, step):
        saveme = {  #TODO save other stuff too, like epoch etc
            'actor_state_dict': self.actor.state_dict(),
            'critic_state_dict': self.critic.state_dict(),
            'actor_optim': self.actor_optim.state_dict(),
            'critic_optim': self.critic_optim.state_dict(),
            'step': step
        }
        torch.save(saveme, savefile + 'model.pth.tar')
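
Code example #1 calls several helpers (hard_update, soft_update, to_tensor, to_numpy) that live elsewhere in its repository and are not shown above. A minimal sketch of what such helpers typically look like, assuming the call signatures used in the snippet; this is an illustration, not the repository's actual code:

import torch

def hard_update(target, source):
    # Copy the source network's parameters into the target verbatim.
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(s_param.data)

def soft_update(target, source, tau):
    # Polyak averaging: target <- tau * source + (1 - tau) * target.
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)

def to_tensor(ndarray, device='cpu'):
    # Wrap a NumPy array as a float32 tensor on the given device.
    return torch.as_tensor(ndarray, dtype=torch.float32, device=device)

def to_numpy(tensor):
    # Detach a tensor and return it as a NumPy array on the CPU.
    return tensor.detach().cpu().numpy()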
Code example #2
class DDPGAgent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 memory,
                 device='cpu',
                 params=None):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            memory (obj): Memory buffer to sample
            device (str): device string between cuda:0 and cpu
            params (dict): hyper-parameters
        """
        self.state_size = state_size
        self.action_size = action_size
        self.device = device
        self.step_t = 0
        self.update_every = params['update_every']

        # Set parameters
        self.gamma = params['gamma']
        self.tau = params['tau']
        random.seed(params['seed'])  # seed Python's global RNG
        self.seed = params['seed']

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, params['seed'],
                                 params['actor_units'][0],
                                 params['actor_units'][1]).to(device)
        self.actor_target = Actor(state_size, action_size, params['seed'],
                                  params['actor_units'][0],
                                  params['actor_units'][1]).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=params['lr_actor'])

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, params['seed'],
                                   params['critic_units'][0],
                                   params['critic_units'][1]).to(device)
        self.critic_target = Critic(state_size, action_size, params['seed'],
                                    params['critic_units'][0],
                                    params['critic_units'][1]).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=params['lr_critic'],
                                           weight_decay=params['weight_decay'])

        # Noise process
        self.noise = OUNoise(action_size,
                             params['seed'],
                             theta=params['noise_theta'],
                             sigma=params['noise_sigma'])

        # Replay memory
        self.memory = memory

    def store_weights(self, filenames):
        """Store weights of Actor/Critic

        Params
        ======
            filenames (list): paths at which to store the actor and critic weights
                              filenames[0] = actor weights
                              filenames[1] = critic weights
        """
        torch.save(self.actor_local.state_dict(), filenames[0])
        torch.save(self.critic_local.state_dict(), filenames[1])

    def load_weights(self, filenames):
        """Load weights of Actor/Critic

        Params
        ======
            filenames (list): paths from which to load the actor and critic weights
                              filenames[0] = actor weights
                              filenames[1] = critic weights
        """
        self.actor_local.load_state_dict(torch.load(filenames[0]))
        self.critic_local.load_state_dict(torch.load(filenames[1]))

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        self.step_t = (self.step_t + 1) % self.update_every

        # Learn, if enough samples are available in memory
        if self.step_t == 0 and len(self.memory) > self.memory.get_batch_size():
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()

        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    @staticmethod
    def soft_update(local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
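
A possible way to wire up code example #2, shown only as a sketch: the ReplayBuffer class, its constructor arguments, the environment id, and the hyper-parameter values are assumptions (the snippet only requires that the memory object expose add, sample, get_batch_size, and __len__), and the environment is assumed to follow the classic Gym reset/step interface.

import gym

# Hypothetical wiring; only DDPGAgent comes from the snippet above.
env = gym.make('Pendulum-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.shape[0]

params = {
    'update_every': 1, 'gamma': 0.99, 'tau': 1e-3, 'seed': 0,
    'actor_units': (400, 300), 'critic_units': (400, 300),
    'lr_actor': 1e-4, 'lr_critic': 1e-3, 'weight_decay': 0.0,
    'noise_theta': 0.15, 'noise_sigma': 0.2,
}
memory = ReplayBuffer(action_size, buffer_size=int(1e6), batch_size=128, seed=0)  # assumed class
agent = DDPGAgent(state_size, action_size, memory, device='cpu', params=params)

for episode in range(200):
    state = env.reset()
    agent.reset()
    for t in range(1000):
        action = agent.act(state, add_noise=True)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state
        if done:
            break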
Code example #3
File: ddpg.py  Project: houyanxu/RL_alg
class DDPGPolicy(object):
	def __init__(self, env_name, policy_config, device='cpu'):
		self.device = device
		self.env = gym.make(env_name)  # only used to read the observation and action spaces
		self.obs_dim = self.env.observation_space.shape[0]
		if isinstance(self.env.action_space, gym.spaces.Box):
			self.action_dim = self.env.action_space.shape[0]
		elif isinstance(self.env.action_space, gym.spaces.Discrete):
			raise TypeError('Discrete action spaces are not supported')
		else:
			raise ValueError('unsupported action space', type(self.env.action_space))

		self.action_limit = self.env.action_space.high[0]
		self.lr = policy_config['lr']
		self.actor = Actor(self.obs_dim, self.action_dim).to(device)
		self.critic = Critic(self.obs_dim, self.action_dim).to(device)
		self.actor_target = deepcopy(self.actor)
		self.critic_target = deepcopy(self.critic)

		hard_update(self.actor_target, self.actor)  # Make sure target is with the same weight
		hard_update(self.critic_target, self.critic)

		self.actor_optim = torch.optim.Adam(params=self.actor.parameters(), lr=self.lr)
		self.critic_optim = torch.optim.Adam(params=self.critic.parameters(), lr=self.lr)
		self.discount_factor = policy_config['discount_factor']
		self.tau = 0.005


	def train_on_batch(self, rollouts_batch):
		# TD target: y = r + gamma * Q_target(s', pi_target(s')); minimize the MSE between Q(s, a) and y
		obs, acs, next_obs, dones, r, un_r, summed_r = convert_listofrollouts(paths=rollouts_batch)
		acs = torch.tensor(acs).float().to(self.device)
		obs = torch.FloatTensor(obs).to(self.device)
		next_obs = torch.FloatTensor(next_obs).to(self.device)
		#acs_one_hot = torch.eye(2).to(self.device).index_select(0,acs)# to one hot discrete action space
		dones = torch.IntTensor(dones).to(self.device)
		r = torch.FloatTensor(r).to(self.device)

		# update critic
		self.critic_optim.zero_grad()
		with torch.no_grad():  # the TD target must not backpropagate into the target networks
			act_target = self.actor_target(next_obs)
			q_target = r + self.discount_factor * self.critic_target(next_obs, act_target) * (1 - dones)
		q_pred = self.critic(obs, acs)
		critic_loss = torch.nn.functional.mse_loss(q_pred, q_target)
		critic_loss.backward()
		self.critic_optim.step()

		#update actor
		self.actor_optim.zero_grad()
		actor_loss = -torch.mean(self.critic(obs,self.actor(obs)))
		actor_loss.backward()
		self.actor_optim.step()

		info = {'loss': actor_loss.cpu().detach().numpy(),  # scalar actor loss
				'model_out': q_target,  # torch.tensor [sum(batch), ac_dim],
				}
		return info

	def update_target_network(self):
		soft_update(self.actor_target, self.actor, self.tau)
		soft_update(self.critic_target, self.critic, self.tau)

	def get_weights(self):
		#TODO: actor and critic parameters
		return {k:v for k,v in self.actor.state_dict().items()}

	def set_weights(self,weights):
		self.actor.load_state_dict(weights)

	def compute_actions(self, obs, noise_scale):  # noise_scale controls exploration; pass 0 during evaluation
		obs = obs.to(self.device)
		actions = self.actor(obs).cpu().detach().numpy()
		actions += noise_scale * np.random.randn(self.action_dim)  # zero-mean Gaussian exploration noise
		actions = np.clip(actions, -self.action_limit, self.action_limit)

		return actions

	def reset(self):  # reset the exploration noise whenever env.reset() is called
		self.random_process.reset_states()  # NOTE: self.random_process is never created in __init__
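
Code examples #1 and #2 both draw exploration noise from an Ornstein-Uhlenbeck process (OrnsteinUhlenbeckProcess / OUNoise) defined outside the snippets, and example #3's reset() refers to a similar random_process. A minimal sketch of such a process follows; the class mirrors the constructor arguments and methods used in example #1, but the implementation is an assumption, not the original code.

import numpy as np

class OrnsteinUhlenbeckProcess:
    """Temporally correlated noise: dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)."""
    def __init__(self, size, theta=0.15, mu=0.0, sigma=0.2, dt=1e-2):
        self.size, self.theta, self.mu, self.sigma, self.dt = size, theta, mu, sigma, dt
        self.reset_states()

    def reset_states(self):
        # Restart the process from its mean.
        self.x_prev = np.ones(self.size) * self.mu

    def sample(self):
        # One Euler step of the OU stochastic differential equation.
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt) * np.random.randn(self.size))
        self.x_prev = x
        return x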
Code example #4
class Agent:
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        random.seed(random_seed)  # seed Python's global RNG
        self.seed = random_seed

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=False):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def train(self, env, n_episodes=2000, max_t=1000):
        scores_deque = deque(maxlen=100)
        scores = []
        for i_episode in range(1, n_episodes + 1):
            state = env.reset()
            self.reset()
            score = 0
            for t in range(max_t):
                action = self.act(state, add_noise=True)
                choice = np.argmax(action)  # map the continuous action vector to a discrete action index
                next_state, reward, done, _ = env.step(choice)
                self.step(state, action, reward, next_state, done)
                state = next_state
                score += reward
                if done:
                    break
            scores_deque.append(score)
            scores.append(score)
            print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}'.format(
                i_episode, np.mean(scores_deque), score),
                  end="")
            if i_episode % 1 == 0:  # checkpoint every episode; raise the modulus to save less often
                torch.save(self.actor_local.state_dict(),
                           'checkpoint_actor.pth')
                torch.save(self.critic_local.state_dict(),
                           'checkpoint_critic.pth')
                print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                    i_episode, np.mean(scores_deque)))
        return scores