Example #1
class DDPG:
    def __init__(self,
                 gamma,
                 memory,
                 s,
                 a,
                 tau,
                 learningRate=1e-3,
                 criticpath=None,
                 actorpath=None):
        self.gamma = gamma
        self.memory = ReplayMemory(memory)
        self.actor = Actor(state=s, actions=a)
        self.critic = Critic(state=s, actions=a)
        if criticpath is not None:
            self.critic.load_state_dict(torch.load(criticpath))
        if actorpath is not None:
            self.actor.load_state_dict(torch.load(actorpath))
        self.targetActor = Actor(state=s, actions=a)
        self.targetActor.load_state_dict(self.actor.state_dict())
        self.targetCritic = Critic(state=s, actions=a)
        self.targetCritic.load_state_dict(self.critic.state_dict())
        self.tau = tau

        self.actorOptimizer = optim.Adam(self.actor.parameters(), learningRate)
        self.criticOptimizer = optim.Adam(self.critic.parameters(),
                                          learningRate)
        #more a dimensionality thing
        self.state = s
        self.action = a
        self.OUarray = np.zeros((1000, self.action), dtype="f")
        self.step = 0

    def processNoise(self):
        #this should be something more elegant....
        #randomly keep, zero out, or negate each uniform noise component
        ret = torch.rand(self.action)
        for i in range(self.action):
            r = random.random()
            if r <= .33:
                pass
            elif r <= .66:
                ret[i] = 0
            else:
                ret[i] = -ret[i]
        return ret

    def OUprocess(self, sigma, theta, mu):
        # define model parameters
        t_0 = 0
        t_end = 10
        length = 1000

        y = np.zeros((length, self.action), dtype="f")
        t = np.linspace(t_0, t_end, length)  # define time axis
        dt = np.mean(np.diff(t))
        drift = lambda y, t: theta * (mu - y)  # define drift term
        diffusion = lambda y, t: sigma  # define diffusion term

        # solve SDE
        for j in range(self.action):
            y[0][j] = np.random.normal(loc=0.0, scale=1.0)  # initial condition
            noise = np.random.normal(loc=0.0, scale=1.0,
                                     size=length) * np.sqrt(
                                         dt)  #define noise process
            for i in range(1, length):
                y[i][j] = y[i - 1][j] + drift(
                    y[i - 1][j], i * dt) * dt + diffusion(y[i - 1][j],
                                                          i * dt) * noise[i]
        self.OUarray = y

    def selectAction(self, state):
        #remember, state better be an autograd Variable
        ret = self.targetActor(Variable(state)).data
        ret = ret + torch.from_numpy(self.OUarray[self.step])
        self.step += 1
        return torch.clamp(ret, 0.0, 1.0)

    def addToMemory(self, state, action, reward, stateprime):
        self.memory.push(state, action, reward, stateprime)

    def primedToLearn(self):
        return self.memory.isFull()

    def PerformUpdate(self, batchsize):
        #Mildly important, according to https://github.com/vy007vikas/PyTorch-ActorCriticRL
        # the criterion on the actor is this: sum(-Q(s,a)) I'm assuming this is over the batch....
        self.actorOptimizer.zero_grad()
        self.criticOptimizer.zero_grad()

        batch = self.memory.batch(batchsize)
        Q = torch.zeros(len(batch), self.state + self.action)
        Qprime = torch.zeros(len(batch), self.state + self.action)
        rewards = torch.zeros(len(batch), 1)
        # This loop should generate all Q values for the batch
        i = 0
        for sample in batch:
            Q[i, :] = torch.cat((sample['s'], sample['a']))
            transition = self.targetActor(
                Variable(sample['sprime'], volatile=True)).data
            Qprime[i, :] = torch.cat((sample['sprime'], transition), dim=0)
            rewards[i, 0] = sample['r'][0]
            i += 1

        #Critic Update
        Qprime = self.gamma * self.targetCritic(
            Variable(Qprime)).data + rewards
        Qprime = Variable(Qprime)
        Q = self.critic(Variable(Q))
        criterion = torch.nn.MSELoss()
        loss = criterion(Q, Qprime)
        loss.backward()
        self.criticOptimizer.step()

        criterion = torch.nn.MSELoss()

        self.actorOptimizer.zero_grad()
        S = torch.zeros(len(batch), self.state)
        i = 0
        for sample in batch:
            S[i, :] = sample['s']
            i += 1
        A = self.actor(Variable(S))
        loss = -1 * torch.sum(self.critic(torch.cat((Variable(S), A), dim=1)))
        loss.backward()
        self.actorOptimizer.step()

    def UpdateTargetNetworks(self):
        criticDict = self.critic.state_dict()
        tCriticDict = self.targetCritic.state_dict()
        for param in criticDict.keys():
            tCriticDict[param] = tCriticDict[param] * (
                1 - self.tau) + criticDict[param] * self.tau

        actorDict = self.actor.state_dict()
        tActorDict = self.targetActor.state_dict()
        for param in actorDict.keys():
            tActorDict[param] = tActorDict[param] * (
                1 - self.tau) + actorDict[param] * self.tau

        self.targetCritic.load_state_dict(tCriticDict)
        self.targetActor.load_state_dict(tActorDict)

    def saveActorCritic(self):
        torch.save(self.critic.state_dict(), './critic')
        torch.save(self.actor.state_dict(), './actor')
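
A minimal driver sketch for the DDPG class above. The make_env() environment, the constants, and the reward format are assumptions for illustration (the reward is assumed to be a one-element tensor, matching the sample['r'][0] indexing in PerformUpdate); only the agent methods are taken from the class itself.

# Hedged usage sketch; make_env() and the constants below are assumptions, not part of the original source.
STATE_DIM, ACTION_DIM = 8, 2
MAX_STEPS = 1000          # must not exceed the 1000-row OUarray indexed by selectAction

env = make_env()          # hypothetical environment whose reset()/step() return torch tensors
agent = DDPG(gamma=0.99, memory=10000, s=STATE_DIM, a=ACTION_DIM, tau=0.01)

for episode in range(500):
    agent.OUprocess(sigma=0.2, theta=0.15, mu=0.0)   # refresh the exploration-noise table
    agent.step = 0                                   # restart the index into OUarray
    state = env.reset()
    for t in range(MAX_STEPS):
        action = agent.selectAction(state)
        next_state, reward, done = env.step(action)  # reward assumed to be a 1-element tensor
        agent.addToMemory(state, action, reward, next_state)
        state = next_state
        if agent.primedToLearn():
            agent.PerformUpdate(batchsize=64)
            agent.UpdateTargetNetworks()
        if done:
            break
agent.saveActorCritic()
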
Example #2
class Agent():
	'''This agent interacts with the environment to learn a policy that yields the highest cumulative reward.
		The agent uses the Deep Deterministic Policy Gradient (DDPG) algorithm.'''

	def __init__(self, state_size, action_size, seed=0):
		'''Initialize the Agent.
		
		Parameters
		----------
		state_size : int
			The dimension of each state
		
		action_size : int
			The dimension of each action
		
		seed : int
			The random seed used to generate random numbers.
		'''
		self.state_size = state_size
		self.action_size = action_size
		random.seed(seed)

		#actor gives the best action for given state
		self.actor_local = Actor(state_size, action_size, seed).to(device)
		self.actor_target = Actor(state_size, action_size, seed).to(device)

		#evaluates the action
		self.critic_local = Critic(state_size, action_size, seed).to(device)
		self.critic_target = Critic(state_size, action_size, seed).to(device)

		self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=ACTOR_LEARNING_RATE)
		self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=CRITIC_LEARNING_RATE, weight_decay=WEIGHT_DECAY)

		#Replay Memory
		self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

		#Noise
		self.noise = OUNoise(action_size,seed)
		self.t_step = 0

	def step(self, state, action, reward, next_state, done):
		'''Instructs the agent to take a step in the environment.

		Executes each time the agent takes a step in the environment.
		The observed (state, action, reward, next_state, done) tuple is saved in the replay buffer.
		Once enough experiences have been captured the model is trained.
		
		Parameters
		----------
		state : array_like
			The current state.
		
		action : int
			The action that was taken.

		reward : int
			The reward that was received.

		next_state : array_like
			The next state.

		done : boolean
			True if the episode is completed, else False
		'''
		self.memory.add(state, action, reward, next_state, done)
		self.t_step = (self.t_step+1)%UPDATE_EVERY
		if self.t_step == 0:
			if len(self.memory) > BATCH_SIZE:
				experiences = self.memory.sample()
				self.train_model_parameters(experiences)
	
	def get_action(self, state, epsilon=0, add_noise=True):
		'''Gets the action for the given state defined by the current policy.

		The method returns the action to take for the given state given the current policy.
		In order to explore in the continuous space noise is added to the action.
		

		Parameters
		----------
		state : array_like
			The current state.

		epsilon : float
			The epsilon value used for epsilon-greedy action selection.

		add_noise : boolean
			Add noise to the action to encourage exploration.

		Returns
		-------
		action : array-like
			The action to take. Each value is between -1 and 1.
		'''
		state = torch.from_numpy(state).float().unsqueeze(0).to(device)
		self.actor_local.eval()
		with torch.no_grad():
			action = self.actor_local(state).cpu().data.numpy()
		self.actor_local.train()
		if add_noise:
			action+=self.noise.sample()
		return np.clip(action,-1,1)

	def train_model_parameters(self, experiences):
		'''Update the model parameters using the given batch of experience tuples.

		The models are trained via the actor-critic paradigm.
		The next action is obtained from the target actor.
		This is then passed to the target critic to obtain the Q-value of the next state.
		The target Q-value for the current state is calculated via the Bellman equation.
		The local critic estimates this Q-value and is updated accordingly.
		The local actor predicts the actions for the current states.
		The loss for the actor is calculated as the negative mean of the critic's Q-value estimates for the predicted actions.

		Parameters
		----------
		experiences : Tuple[torch.Variable]
			A name tuple of state, action, reward, next_action and done.
		'''
		states, actions, rewards, next_states, dones = experiences
		
		#Update critic
		next_actions = self.actor_target(next_states)
		Q_next_states = self.critic_target(next_states,next_actions)
		Q_states = rewards + GAMMA*Q_next_states*(1-dones)
		Q_states_estimated = self.critic_local(states,actions)
		critic_loss = F.mse_loss(Q_states_estimated, Q_states)
		self.critic_optimizer.zero_grad()
		critic_loss.backward()
		self.critic_optimizer.step()
		
		#Update actor
		actions_pred = self.actor_local(states)
		actor_loss = -self.critic_local(states,actions_pred).mean()
		self.actor_optimizer.zero_grad()
		actor_loss.backward()
		self.actor_optimizer.step()	

		self._update_model_parameters(self.critic_local, self.critic_target)     
		self._update_model_parameters(self.actor_local, self.actor_target)     

	def _update_model_parameters(self,local_network, target_network):
		'''Copy the learned local network parameters to the target network.

		This method updates the Target network with the learned network parameters.
		The target parameters are moved a fraction TAU towards the learned local parameters.
		This is done to help reduce harmful correlations caused by a constantly moving target.
		'''
		for target_param, local_param in zip(target_network.parameters(), local_network.parameters()):
			target_param.data.copy_(TAU*local_param.data + (1-TAU) * target_param.data)
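
The Agent above reads several module-level constants and a device object that are defined elsewhere in the original project. The sketch below only shows plausible typical values so the class reads as self-contained; the names match the references in the class, the concrete values are assumptions.

import torch

# Assumed module-level configuration for the Agent above (typical DDPG defaults, not the original values).
BUFFER_SIZE = int(1e5)          # replay buffer size
BATCH_SIZE = 128                # minibatch size
GAMMA = 0.99                    # discount factor
TAU = 1e-3                      # soft-update interpolation factor
ACTOR_LEARNING_RATE = 1e-4
CRITIC_LEARNING_RATE = 1e-3
WEIGHT_DECAY = 0.0              # L2 penalty for the critic optimizer
UPDATE_EVERY = 4                # learn every UPDATE_EVERY environment steps

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
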
Example #3
class Policy:

    def save_policy(self, save_name):
        self.actor.save_model(save_name=save_name)

    def load_policy(self, load_name):
        self.actor.load_model(load_name=load_name)

    def demonstrate(self, ep_count=1):
        env = gym.make(self.envid)
        with torch.no_grad():
            for e in range(ep_count):
                done = False
                ob = env.reset()
                while not done:
                    observation = ob[None]
                    action = self.sample_action(observations=observation)
                    action = action[0]
                    ob_, r, done, _ = env.step(action)
                    env.render()
        env.close()

    def weights_init(self, m):
        if hasattr(m, 'weight'):
            torch.nn.init.xavier_uniform_(m.weight)
            m.bias.data.fill_(0)

    def __init__(self, env_id="LunarLanderContinuous-v2"):
        self.envid = env_id
        env = gym.make(env_id)
        state_size = np.prod(list(env.observation_space.shape))
        action_size = np.prod(list(env.action_space.shape))

        # the max and min here are a shortcut
        # because all action dimensions have the same range here;
        # ideally, when clamping, different dimensions should support different ranges
        self.low_action = env.action_space.low.max()
        self.high_action = env.action_space.high.min()

        self.actor = Actor(n_ip=state_size, n_op=action_size)
        self.critic = Critic(n_ip=state_size)

        self.actor.apply(self.weights_init)
        self.critic.apply(self.weights_init)

        if torch.cuda.is_available():
            self.use_gpu = True
            self.device = torch.device("cuda")
        else:
            self.use_gpu = False
            self.device = torch.device("cpu")

    def sample_action(self, observations):
        with torch.no_grad():
            ob = torch.Tensor(observations).to(self.device)
            m, s = self.actor(ob)
            chnk = len(m[0])  # the size of action space here
            m = m.cpu().flatten().float()
            s = s.cpu().flatten().float()
            samples = torch.normal(mean=m, std=s)
            samples = torch.clamp(samples, min=self.low_action, max=self.high_action)
            # sampled_action = samples.reshape(-1, chnk).numpy()
            sampled_action = samples.reshape(-1, chnk).detach().numpy()
            return sampled_action

    def get_log_prob(self, mean, standard_deviation, actions):

        m = mean
        s = standard_deviation
        log_prob = torch.distributions.Normal(loc=m, scale=s).log_prob(actions)
        log_prob = log_prob.sum(-1)
        return log_prob

    def improve_critic(self, data_loader, lr=0.001, batch_size=128, iterations=1):
        total_loss = 0
        total_len = 0

        optimizer = optim.Adam(self.critic.parameters(), lr=lr)
        loader = dataloader.DataLoader(data_loader, batch_size=batch_size, shuffle=True)
        for e in range(iterations):
            for states, targets in loader:
                optimizer.zero_grad()  # reset gradients for each minibatch
                total_len += len(targets)
                # the targets here should be normalized
                prediction = self.critic(states)
                loss = nn.functional.mse_loss(prediction, targets)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()

        avg_loss = total_loss / total_len
        return avg_loss

    def improve_actor(self, data_loader, lr=0.001, batch_size=128, iterations=1):
        optimizer = optim.Adam(self.actor.parameters(), lr=lr)
        loader = dataloader.DataLoader(data_loader, batch_size=batch_size, shuffle=True)
        for e in range(iterations):
            for states, actions, values in loader:
                optimizer.zero_grad()
                m, s = self.actor(states)
                lp = self.get_log_prob(mean=m, standard_deviation=s, actions=actions)
                loss = torch.sum(-(lp * values))
                loss.backward()
                optimizer.step()
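
The Policy class above assumes an Actor(n_ip, n_op) that returns a (mean, std) pair and a Critic(n_ip) that returns a state value, neither of which is shown. The sketch below is only an assumption about what those missing definitions might look like; the save_model/load_model helpers used by save_policy/load_policy are omitted.

import torch
import torch.nn as nn

class Actor(nn.Module):
    # Assumed Gaussian policy head: maps a state to an action mean and standard deviation.
    def __init__(self, n_ip, n_op, hidden=64):
        super().__init__()
        self.body = nn.Sequential(nn.Linear(n_ip, hidden), nn.ReLU())
        self.mean_head = nn.Linear(hidden, n_op)
        self.log_std = nn.Parameter(torch.zeros(n_op))

    def forward(self, x):
        h = self.body(x)
        mean = torch.tanh(self.mean_head(h))
        std = self.log_std.exp().expand_as(mean)
        return mean, std

class Critic(nn.Module):
    # Assumed state-value baseline used by improve_critic.
    def __init__(self, n_ip, hidden=64):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(n_ip, hidden), nn.ReLU(), nn.Linear(hidden, 1))

    def forward(self, x):
        return self.net(x).squeeze(-1)
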
Example #4
class Agent():
    """Interacts with and learns from the environment."""
    
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
    
    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        
        logging.warning(action)
        return np.clip(action, 0.0000001, 7.0)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
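
Several of the agents in this listing draw exploration noise from an OUNoise helper that is not shown. A minimal Ornstein-Uhlenbeck noise class with the constructor and sample()/reset() interface those agents expect might look roughly like this; the parameter defaults are assumptions.

import copy
import random
import numpy as np

class OUNoise:
    """Minimal Ornstein-Uhlenbeck process; a sketch of the helper the agents above assume."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset()

    def reset(self):
        # Restart the internal state at the long-run mean.
        self.state = copy.copy(self.mu)

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1), then x <- x + dx
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(x.shape)
        self.state = x + dx
        return self.state
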
class Agent():
    def __init__(self, params):
        self.action_size = params['action_size']
        self.state_size = params['state_size']
        self.num_agents = params['num_agents']
        self.buffer_size = params['buffer_size']
        self.batch_size = params['batch_size']
        self.__gamma = params['gamma']
        self.__tau = params['tau']
        self.__update_every = params['update_every']
        self.__save_to = params['save_to']
        self.__memory = ReplayBuffer(self.buffer_size, self.batch_size)
        self.__lr = params['lr']
        self.noise_type = params['noise_type']

        actor_params = dict()
        actor_params['arch_params_actor'] = params['arch_params_actor']
        actor_params['action_size'] = self.action_size
        actor_params['state_size'] = self.state_size
        actor_params['eps'] = params['eps']
        actor_params['eps_decay'] = params['eps_decay']
        actor_params['eps_min'] = params['min_eps']
        actor_params['noise_type'] = params['noise_type']
        self.actor = Actor(actor_params)
        self.actor_target = Actor(actor_params)
        self.optimizer_actor = optim.Adam(self.actor.parameters(),
                                          lr=self.__lr)
        self.scheduler_actor = optim.lr_scheduler.StepLR(self.optimizer_actor,
                                                         step_size=100,
                                                         gamma=0.95)

        critic_params = dict()
        critic_params['arch_params_critic'] = params['arch_params_critic']
        critic_params['action_size'] = self.action_size
        critic_params['state_size'] = self.state_size
        self.critic = Critic(critic_params)
        self.critic_target = Critic(critic_params)
        self.optimizer_critic = optim.Adam(self.critic.parameters(),
                                           lr=self.__lr)
        self.scheduler_critic = optim.lr_scheduler.StepLR(self.optimizer_critic,
                                                          step_size=100,
                                                          gamma=0.95)
        self.__t = 0

    def memorize_experience(self, state, action, reward, next_state, done):
        self.__memory.add(state, action.detach(), reward, next_state, done)
        self.__t = (self.__t + 1)

    def learn_from_past_experiences(self):
        if self.__t % self.__update_every == 0:
            if len(self.__memory) > self.batch_size:
                experiences = self.__memory.sample()
                self.update_actor_critic(experiences)

    def choose_action(self, state):
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        state = torch.from_numpy(state.astype(np.float32)).to(device)
        action, action_perturbed = self.actor(state)
        return action, action_perturbed

    def update_actor_critic(self, experiences):
        states, actions, rewards, next_states, dones = experiences
        next_actions, next_actions_perturbed = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, next_actions)
        Q_targets = rewards + (self.__gamma * Q_targets_next * (1 - dones)
                               )  # if done == True: second term is equal to 0
        Q_expected = self.critic(states, actions)
        loss_func = nn.MSELoss()
        loss_critic = loss_func(Q_expected, Q_targets.detach())

        self.optimizer_critic.zero_grad()
        loss_critic.backward()
        # self.scheduler_critic.step()
        self.optimizer_critic.step()

        predicted_actions, predicted_actions_perturbed = self.actor(
            states)  # new predicted actions, not the ones stored in buffer

        if self.noise_type == 'parameter':
            #if the distance between predicted_actions and predicted_actions_perturbed is too big (>=0.2) then update noise
            if (predicted_actions -
                    predicted_actions_perturbed).pow(2).mean() >= 0.15:
                self.actor.eps /= 1.01
                self.actor_target.eps /= 1.01
            else:
                self.actor.eps *= 1.01
                self.actor_target.eps *= 1.01

        loss_actor = -self.critic(states, predicted_actions).mean()

        self.optimizer_actor.zero_grad()
        loss_actor.backward()
        # self.scheduler_actor.step()
        self.optimizer_actor.step()

        self.soft_update(self.critic, self.critic_target)
        self.soft_update(self.actor, self.actor_target)

    def update_eps(self):
        self.actor.eps = max(self.actor.eps * self.actor.eps_decay,
                             self.actor.eps_min)
        self.actor_target.eps = max(
            self.actor_target.eps * self.actor_target.eps_decay,
            self.actor_target.eps_min)

    def soft_update(self, local_model, target_model):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.__tau * local_param.data +
                                    (1.0 - self.__tau) * target_param.data)

    def save_weights(self, save_to):
        actor_params_and_state_dict = {
            'actor_params': self.actor.actor_params,
            'state_dict': self.actor.state_dict()
        }
        critic_params_and_state_dict = {
            'critic_params': self.critic.critic_params,
            'state_dict': self.critic.state_dict()
        }

        file = dict()
        file['critic_params_and_state_dict'] = critic_params_and_state_dict
        file['actor_params_and_state_dict'] = actor_params_and_state_dict
        torch.save(file, open(save_to, 'wb'))

    def load_weights(self, load_from):
        checkpoint = torch.load(load_from)

        critic_params_and_state_dict = checkpoint[
            'critic_params_and_state_dict']
        actor_params_and_state_dict = checkpoint['actor_params_and_state_dict']

        self.actor = Actor(actor_params_and_state_dict['actor_params'])
        self.actor.load_state_dict(actor_params_and_state_dict['state_dict'])

        self.critic = Critic(critic_params_and_state_dict['critic_params'])
        self.critic.load_state_dict(critic_params_and_state_dict['state_dict'])
        return self
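
The ReplayBuffer used by the agent above is constructed with only a capacity and a batch size and must return batches of torch tensors. The class below is a sketch consistent with that usage, not the original implementation; stored items are assumed to be CPU arrays or CPU tensors.

import random
from collections import deque, namedtuple

import numpy as np
import torch

class ReplayBuffer:
    """Sketch of a fixed-size experience buffer matching ReplayBuffer(buffer_size, batch_size)."""

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience",
                                     ["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        batch = random.sample(self.memory, k=self.batch_size)
        states = torch.tensor(np.vstack([e.state for e in batch]), dtype=torch.float, device=device)
        actions = torch.tensor(np.vstack([e.action for e in batch]), dtype=torch.float, device=device)
        rewards = torch.tensor(np.vstack([e.reward for e in batch]), dtype=torch.float, device=device)
        next_states = torch.tensor(np.vstack([e.next_state for e in batch]), dtype=torch.float, device=device)
        dones = torch.tensor(np.vstack([e.done for e in batch]).astype(np.uint8), dtype=torch.float, device=device)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)
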
Example #6
class ActorCritic:
	def __init__(self, state_dim, action_dim, memory, load):
		self.memory = memory
		self.noise = OrnsteinUhlenbeckActionNoise(action_dim)

		self.actor = Actor(state_dim, action_dim)
		self.critic = Critic(state_dim, action_dim)
		self.target_actor = Actor(state_dim, action_dim)
		self.target_critic = Critic(state_dim, action_dim)

		self.critic.cuda()
		self.actor.cuda()
		self.target_critic.cuda()
		self.target_actor.cuda()

		self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),LEARNING_RATE)
		self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),LEARNING_RATE)

		self.loss_funct = nn.SmoothL1Loss()
		if load != 0:
			self.load_models(load) #load the model

# Target and trained networks are the same when initializing
		self.net_update(self.target_actor, self.actor, True)
		self.net_update(self.target_critic, self.critic, True)

# Predict an action with or without noise depending on the "train" flag
	def get_action(self, state, train):
		state = Variable(torch.from_numpy(np.float32(state)).type(torch.cuda.FloatTensor))
		action = self.actor.forward(state).detach().cpu().numpy()
		if train:
			noise = np.float32(self.noise.sample())
			return action + noise
		return action

# Run the optimization:
#	Get predicted action from the next state by Target Actor
#	Base on that predict the Value of that action by Target Critic
#	Use the predicted value to update Critic, and then Actor
#	Soft update target networks to mirror the progress
	def optimize(self):
		state,action,reward,next_state = self.memory.sample(BATCH_SIZE)

		state = Variable(torch.from_numpy(np.float32(state)).type(torch.cuda.FloatTensor))
		action = Variable(torch.from_numpy(np.float32(action)).type(torch.cuda.FloatTensor))
		reward = Variable(torch.from_numpy(np.float32(reward)).type(torch.cuda.FloatTensor))
		next_state = Variable(torch.from_numpy(np.float32(next_state)).type(torch.cuda.FloatTensor))

		next_action = self.target_actor.forward(next_state).detach()
		target = reward + GAMMA*torch.squeeze(self.target_critic.forward(next_state, next_action).detach())

		prediction = torch.squeeze(self.critic.forward(state, action))

		loss_critic = self.loss_funct(prediction, target)
		self.critic_optimizer.zero_grad()
		loss_critic.backward()
		self.critic_optimizer.step()

		action = self.actor.forward(state)
		loss_actor = -1*torch.sum(self.critic.forward(state, action))
		self.actor_optimizer.zero_grad()
		loss_actor.backward()
		self.actor_optimizer.step()

		self.net_update(self.target_actor, self.actor, False)
		self.net_update(self.target_critic, self.critic, False)

	# Apply soft or hard update on the network
	def net_update(self,target, source, hard):
		degree = 1
		if not hard: degree = TAU
		for target_param, param in zip(target.parameters(), source.parameters()):
			target_param.data.copy_(target_param.data * (1.0 - degree) + param.data * degree)

# Store the models
	def save_models(self, episode):
		torch.save(self.target_actor.state_dict(), 'Models/' + str(episode) + '_actor.pt')
		torch.save(self.target_critic.state_dict(), 'Models/' + str(episode) + '_critic.pt')
		
# Load the models
	def load_models(self, episode):
		self.actor.load_state_dict(torch.load('Models/' + str(episode) + '_actor.pt'))
		self.critic.load_state_dict(torch.load('Models/' + str(episode) + '_critic.pt'))
		self.net_update(self.target_actor, self.actor, True)
		self.net_update(self.target_critic, self.critic, True)
		print('Models loaded successfully')
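
The memory object passed into this ActorCritic only needs a sample(batch_size) that returns four stacked numpy arrays (states, actions, rewards, next states), and the class also refers to a few module-level constants. The buffer and the constant values below are assumptions offered for context, not the original code.

import random
from collections import deque

import numpy as np

# Assumed module-level constants referenced by ActorCritic; values are typical, not from the source.
LEARNING_RATE = 1e-3
BATCH_SIZE = 128
GAMMA = 0.99
TAU = 0.001

class SimpleMemory:
    """Hypothetical buffer whose sample() matches the unpacking in ActorCritic.optimize()."""

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def add(self, state, action, reward, next_state):
        self.buffer.append((state, action, reward, next_state))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states = map(np.array, zip(*batch))
        return states, actions, rewards, next_states

    def __len__(self):
        return len(self.buffer)
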
Example #7
class DDPG_Agent():
    def __init__(self, state_size, action_size, num_agents):
        """
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents in the environment
        """
        random_seed = 1

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.num_agents = num_agents

        # Replay memory
        self.memory = ReplayBuf(action_size, BUFFER_SIZE, BATCH_SIZE,
                                random_seed)

        # Noise process
        self.noise = Ornstein_Uhlenbeck_Noise(action_size, random_seed)

        # Critic Networks
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Actor Networks
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

    def step(self, states, actions, rewards, next_states, dones):
        """ add an experience in the reply buffer 
        then sample randomly from that buffer to learn (reason behind the random sampling is to break 
        the correlation between sequential experiences)
        """
        # Save experience
        for i in range(self.num_agents):
            self.memory.add(states[i], actions[i], rewards[i], next_states[i],
                            dones[i])

        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, states, add_noise=True):
        """Returns actions for given state """
        states = torch.from_numpy(states).float().to(device)

        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for i, state in enumerate(states):
                # Populate list of actions one state at a time
                actions[i, :] = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            # We add noise for exploration purposes
            actions += self.noise.sample()

        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ### Update critic
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Calculate Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(
            self.critic_local.parameters(),
            1)  # adds gradient clipping to stabilize learning
        self.critic_optimizer.step()

        ### Update actor
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        ### Update target networks
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, regular_model, target_model, tau):
        """
            regular_model: the most up-to-date model, as it is the one used for training
            target_model: the more stable model; we softly copy the weights of the regular model towards it
            tau (float): interpolation parameter 
        """
        for target_param, regular_param in zip(target_model.parameters(),
                                               regular_model.parameters()):
            target_param.data.copy_(tau * regular_param.data +
                                    (1.0 - tau) * target_param.data)
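
A rough sketch of how this multi-agent DDPG_Agent could be driven, assuming a vectorized environment that returns one row of observations per agent. The environment object and its API are assumptions; the agent calls are the methods defined above.

import numpy as np

# Hypothetical driver; vec_env and make_vectorized_env() are assumptions for illustration.
num_agents, state_size, action_size = 2, 24, 2
vec_env = make_vectorized_env()              # hypothetical vectorized environment
agent = DDPG_Agent(state_size, action_size, num_agents)

for episode in range(200):
    states = vec_env.reset()                 # assumed shape: (num_agents, state_size)
    agent.reset()                            # reset the OU noise process
    scores = np.zeros(num_agents)
    while True:
        actions = agent.act(states)          # shape: (num_agents, action_size), clipped to [-1, 1]
        next_states, rewards, dones = vec_env.step(actions)   # assumed gym-like vectorized step
        agent.step(states, actions, rewards, next_states, dones)
        scores += rewards
        states = next_states
        if np.any(dones):
            break
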
Example #8
class TD3:
    def __init__(self, n_features, action_bounds):
        self.n_features = n_features
        self.action_bounds = action_bounds

        self.eval_actor_net = Actor(n_features, action_bounds)
        self.load_weights(self.eval_actor_net)
        self.eval_actor_net.train()
        self.target_actor_net = copy.deepcopy(self.eval_actor_net)
        self.target_actor_net.eval()

        self.eval_critic_net1 = Critic(n_features, action_bounds)
        self.load_weights(self.eval_critic_net1)
        self.eval_critic_net1.train()

        self.eval_critic_net2 = Critic(n_features, action_bounds)
        self.load_weights(self.eval_critic_net2)
        self.eval_critic_net2.train()

        self.target_critic_net1 = copy.deepcopy(self.eval_critic_net1)
        self.target_critic_net1.eval()
        self.target_critic_net2 = copy.deepcopy(self.eval_critic_net2)
        self.target_critic_net2.eval()

        self.memory = Memory(Config.MEMORY_CAPACITY)
        self.batch_size = Config.BATCH_SIZE
        self.tau = Config.REPLACEMENT_SOFT_TAU

        # we need a good teacher, so the teacher should learn faster than the actor
        self.optimizer_actor = torch.optim.Adam(
            self.eval_actor_net.parameters(), Config.LR_ACTOR, (0.9, 0.99))
        # itertools.chain(self.encoder.parameters(), self.decoder.parameters())
        # self.optimizer_critic = \
        #     torch.optim.Adam([{'params': self.eval_critic_net1.parameters()},
        #                       {'params': self.eval_critic_net2.parameters()}], Config.LR_CRITIC, (0.9, 0.99))
        self.optimizer_critic1 = \
            torch.optim.Adam(self.eval_critic_net1.parameters(), Config.LR_CRITIC, (0.9, 0.99))
        self.optimizer_critic2 = \
            torch.optim.Adam(self.eval_critic_net2.parameters(), Config.LR_CRITIC, (0.9, 0.99))

        self.gamma = Config.REWARD_DECAY
        self.policy_noise_clip = Config.POLICY_NOISE_CLIP
        self.policy_delay = Config.DELAY_POLICY_UPDATE_ITER
        self.learn_iter = 0

    def load_weights(self, net):
        # names in net.state_dict() look like 'layers.1.weight'
        for m in net.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 1)
                nn.init.constant_(m.bias, 0.1)

    def store_transition(self, s, a, r, s_):
        self.memory.store([s, a, r, s_])

    def chose_action(self, s):
        s = torch.Tensor(np.expand_dims(s, axis=0))
        action = self.eval_actor_net(s).detach().squeeze(dim=0)
        return action

    def learn(self):
        self.learn_iter += 1
        # for x in self.Actor_target.state_dict().keys():
        #     eval('self.Actor_target.' + x + '.data.mul_((1-TAU))')
        #     eval('self.Actor_target.' + x + '.data.add_(TAU*self.Actor_eval.' + x + '.data)')
        # for x in self.Critic_target.state_dict().keys():
        #     eval('self.Critic_target.' + x + '.data.mul_((1-TAU))')
        #     eval('self.Critic_target.' + x + '.data.add_(TAU*self.Critic_eval.' + x + '.data)')

        # for target_param, param in zip(net_target.parameters(), net.parameters()):
        #     target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)
        # for k, v in self.eval_critic_net.state_dict().items():
        #     self.target_critic_net.state_dict()[k].copy_(self.tau * v + (1-self.tau) * self.target_critic_net.state_dict()[k])
        # for k, v in self.eval_actor_net.state_dict().items():
        #     self.target_actor_net.state_dict()[k].copy_(self.tau * v + (1-self.tau) * self.target_actor_net.state_dict()[k])

        batch_data = self.memory.sample(self.batch_size)
        s0, a0, r1, s1 = zip(*batch_data)
        s0 = torch.tensor(s0, dtype=torch.float)
        a0 = torch.tensor(a0, dtype=torch.float).view(self.batch_size,
                                                      len(self.action_bounds))
        r1 = torch.tensor(r1, dtype=torch.float).view(self.batch_size, -1)
        s1 = torch.tensor(s1, dtype=torch.float)

        # Select action according to policy and add clipped noise

        # Input (s, a), output q
        q_s0_a0_1 = self.eval_critic_net1(s0, a0)
        q_s0_a0_2 = self.eval_critic_net2(s0, a0)
        # Input (s_, a_), output q_ for q_target
        # compute a_ (the target action)
        noise = (torch.randn_like(a0) * self.policy_noise_clip * 2).clamp(
            -self.policy_noise_clip, self.policy_noise_clip)
        a1 = self.target_actor_net(s1).detach() + noise
        action_bound = self.action_bounds.expand_as(a1)
        a1[a1 < -action_bound] = -action_bound[a1 < -action_bound]
        a1[a1 > action_bound] = action_bound[a1 > action_bound]

        q_s1_a1_1 = self.target_critic_net1(s1, a1).detach()
        q_s1_a1_2 = self.target_critic_net2(s1, a1).detach()
        q_s1_a1 = torch.min(q_s1_a1_1, q_s1_a1_2)
        q_target = r1 + self.gamma * q_s1_a1

        loss_critic = nn.MSELoss()(q_s0_a0_1, q_target) + nn.MSELoss()(
            q_s0_a0_2, q_target)

        # critic learning step
        # # td_error = R + GAMMA * ct(bs_, at(bs_)) - ce(s, ba); this updates ce,
        # where ae(s) here is the stored action ba; making ce's Q approach Q_target makes the evaluation more accurate
        # loss = (Q(st, at) - (rt + r*Q'(st+1, u'(st+1))))**2
        self.optimizer_critic1.zero_grad()
        self.optimizer_critic2.zero_grad()
        loss_critic.backward()
        self.optimizer_critic1.step()
        self.optimizer_critic2.step()
        loss_actor = 0
        # actor learning step
        # https://zhuanlan.zhihu.com/p/84321382
        # Delayed policy updates
        if self.learn_iter % self.policy_delay == 0:
            actor_a = self.eval_actor_net(s0)
            critic_q = self.eval_critic_net1(s0, actor_a)
            # loss = -q = -ce(s, ae(s)); this updates ae, where ae(s)=a and ae(s_)=a_
            # if a is a good action, its Q-value should be high, so minimizing -Q pushes the actor towards it
            loss_actor = -torch.mean(critic_q)

            self.optimizer_actor.zero_grad()
            loss_actor.backward()
            self.optimizer_actor.step()
            # Update the frozen target models
            for param, target_param in zip(
                    self.eval_critic_net1.parameters(),
                    self.target_critic_net1.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)
            for param, target_param in zip(
                    self.eval_critic_net2.parameters(),
                    self.target_critic_net2.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)
            for param, target_param in zip(self.eval_actor_net.parameters(),
                                           self.target_actor_net.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

        return loss_critic, loss_actor

    def draw_curve(self, loss):
        x = np.arange(1, len(loss) + 1)
        plt.title("cost curve")
        plt.xlabel("train step")
        plt.ylabel("cost")
        plt.plot(x, loss)
        plt.show()
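
The TD3 class above (and the Actor_Critic classes further down) read all of their hyperparameters from a Config object that is not shown. The stand-in below lists the attributes those classes access; the attribute names come from the code, the values are only plausible guesses.

class Config:
    """Assumed hyperparameter container; names are taken from the classes above, values are guesses."""
    MEMORY_CAPACITY = 100000        # replay memory size
    BATCH_SIZE = 64                 # minibatch size
    REPLACEMENT_SOFT_TAU = 0.005    # soft-update rate for the target networks
    LR_ACTOR = 1e-4                 # actor learning rate
    LR_CRITIC = 1e-3                # critic learning rate
    REWARD_DECAY = 0.99             # discount factor gamma
    POLICY_NOISE_CLIP = 0.5         # clip range for target-policy smoothing noise (TD3)
    DELAY_POLICY_UPDATE_ITER = 2    # update the actor every N critic updates (TD3)
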
class Agent:
    """
    Interacts with and learns from the environment.
    """
    def __init__(self, state_size, action_size, num_agents, random_seed):
        """
        Initialize an Agent

        Params
        ======
            state_size (int): state dimension
            action_size (int): action dimension
            num_agents (int): simultaneous running agents
            random_seed (int): random seed
        """

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        random.seed(random_seed)

        # Actor Network and its target network
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network and its target network
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise object
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Replay Memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   EXPERIENCES_PER_SAMPLING, device,
                                   random_seed)

        # Initialize time step (for updating every UPDATE_NN_EVERY steps)
        self.t_step_nn = 0
        # Initialize time step (for updating every UPDATE_MEM_PAR_EVERY steps)
        self.t_step_mem_par = 0
        # Initialize time step (for updating every UPDATE_MEM_EVERY steps)
        self.t_step_mem = 0

    def step(self, state, action, reward, next_state, done):
        """
        Save experience in replay memory, and use prioritized sample from buffer to learn.
        """

        # Save memory
        for i in range(self.num_agents):
            self.memory.add(state[i, :], action[i, :], reward[i],
                            next_state[i, :], done[i])

        # Learn every UPDATE_NN_EVERY time steps.
        self.t_step_nn = (self.t_step_nn + 1) % UPDATE_NN_EVERY
        self.t_step_mem = (self.t_step_mem + 1) % UPDATE_MEM_EVERY
        self.t_step_mem_par = (self.t_step_mem_par + 1) % UPDATE_MEM_PAR_EVERY

        if self.t_step_mem_par == 0:
            self.memory.update_parameters()
        if self.t_step_nn == 0:
            # Learn from memory if enough samples exist
            if self.memory.experience_count > EXPERIENCES_PER_SAMPLING:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

        if self.t_step_mem == 0:
            self.memory.update_memory_sampling()

    def act(self, states, add_noise=True):
        """
        Returns actions for given state as per current policy.
        """
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for i, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[i, :] = action

        self.actor_local.train()
        if add_noise:
            actions += self.noise.sample()

        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """
        Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, indices = experiences

        # update Critic
        # Get next predicted state, actions, and Q values
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        # Compute Q targets for current state
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute Critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Update Actor
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update target networks
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # Update priorities
        delta = abs(Q_targets - Q_expected).detach().cpu().numpy()
        self.memory.update_priorities(delta, indices)

    @staticmethod
    def soft_update(local_model, target_model, tau):
        """
        Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """

        for target_model_param, local_model_param in zip(
                target_model.parameters(), local_model.parameters()):
            target_model_param.data.copy_(tau * local_model_param.data +
                                          (1. - tau) * target_model_param.data)
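
This variant feeds the absolute TD error back into a prioritized replay buffer via update_priorities. As background (not the buffer's actual implementation), the proportional prioritization scheme typically turns those errors into sampling probabilities as p_i = (|delta_i| + eps)**alpha, normalized; the helper and values below are illustrative.

import numpy as np

def priorities_to_probabilities(deltas, alpha=0.6, eps=1e-5):
    """Proportional prioritization: p_i = (|delta_i| + eps)**alpha, normalized to sum to 1."""
    priorities = (np.abs(deltas) + eps) ** alpha
    return priorities / priorities.sum()

# Tiny worked example with three stored transitions and their TD errors.
deltas = np.array([0.5, 0.1, 2.0])
probs = priorities_to_probabilities(deltas)
indices = np.random.choice(len(deltas), size=2, p=probs)   # sample a minibatch of 2 indices
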
Example #10
class AsyncDDPG(object):
    def __init__(self,
                 gamma,
                 s,
                 a,
                 learningRate=1e-3,
                 criticpath=None,
                 actorpath=None):
        self.gamma = gamma
        self.actor = Actor(state=s, actions=a, hidden1=180, hidden2=87)
        self.critic = Critic(state=s, actions=a, hidden1=250, hidden2=100)
        if criticpath is not None:
            self.critic.load_state_dict(torch.load(criticpath))
        if actorpath is not None:
            self.actor.load_state_dict(torch.load(actorpath))

        self.actorOptimizer = optim.Adam(self.actor.parameters(), learningRate)
        self.criticOptimizer = optim.Adam(self.critic.parameters(),
                                          learningRate)
        #more a dimensionality thing
        self.state = s
        self.action = a
        self.count = 0

    def PerformUpdate(self, batchsize, target):
        #Mildly important, according to https://github.com/vy007vikas/PyTorch-ActorCriticRL
        # the criterion on the actor is this: sum(-Q(s,a)) I'm assuming this is over the batch....
        self.actorOptimizer.zero_grad()
        self.criticOptimizer.zero_grad()

        batch = target.getBatchMemory(batchsize)

        Q = torch.zeros(len(batch), self.state + self.action)
        Qprime = torch.zeros(len(batch), self.state + self.action)
        rewards = torch.zeros(len(batch), 1)

        # This loop should generate all Q values for the batch
        i = 0
        for sample in batch:
            Q[i, :] = torch.cat((sample['s'], sample['a']))
            transition = target.targetActor(
                Variable(sample['sprime'], volatile=True)).data
            Qprime[i, :] = torch.cat((sample['sprime'], transition), dim=0)
            rewards[i, 0] = sample['r'][0]
            i += 1

        #Critic Update
        Qprime = self.gamma * target.targetCritic(
            Variable(Qprime)).data + rewards
        Qprime = Variable(Qprime)

        Q = self.critic(Variable(Q))
        criterion = torch.nn.MSELoss()
        loss = criterion(Q, Qprime)
        loss.backward()
        self.criticOptimizer.step()

        criterion = torch.nn.MSELoss()
        #Actor update
        self.actorOptimizer.zero_grad()
        S = torch.zeros(len(batch), self.state)
        i = 0
        for sample in batch:
            S[i, :] = sample['s']
            i += 1
        A = self.actor(Variable(S))
        loss = -1 * torch.sum(self.critic(torch.cat((Variable(S), A), dim=1)))
        loss.backward()
        self.actorOptimizer.step()

    def getActor(self):
        return self.actor

    def getCritic(self):
        return self.critic

    def ProduceTargetActorCritic(self, memory=2000, tau=.25, epsilon=.5):
        print(self.count)
        self.count += 1
        s = self.state
        a = self.action
        return TargetActorCritic(self.actor,
                                 self.critic,
                                 memory,
                                 s,
                                 a,
                                 tau,
                                 epsilon=0.5)

    def saveActorCritic(self):
        torch.save(self.critic.state_dict(), './AsyncCritic')
        torch.save(self.actor.state_dict(), './AsyncActor')
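
AsyncDDPG delegates both the replay memory and the target networks to a TargetActorCritic object that is not shown. The snippet below is only a hedged sketch of the intended interaction, using the methods visible above and assuming the target object is filled with experience elsewhere.

# Hypothetical interaction between AsyncDDPG and the TargetActorCritic it produces (sketch only).
STATE_DIM, ACTION_DIM = 8, 2
learner = AsyncDDPG(gamma=0.99, s=STATE_DIM, a=ACTION_DIM)
target = learner.ProduceTargetActorCritic(memory=2000, tau=0.25, epsilon=0.5)

# ... worker processes are assumed to act with `target` and fill its replay memory elsewhere ...

learner.PerformUpdate(batchsize=64, target=target)   # one update of the shared actor/critic
learner.saveActorCritic()
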
class Actor_Critic:
    def __init__(self, n_features, actions=None, is_continues=None):
        self.actions = actions
        self.is_continues = is_continues
        self.actor_net = Actor(n_features,
                               actions=actions,
                               is_continues=is_continues)
        self.critic_net = Critic(n_features)
        self.load_weights(self.actor_net)
        self.load_weights(self.critic_net)
        # we need a good teacher, so the teacher should learn faster than the actor
        self.optimizer_actor = torch.optim.Adam(self.actor_net.parameters(),
                                                Config.LR_ACTOR, (0.9, 0.99))
        self.optimizer_critic = torch.optim.Adam(self.critic_net.parameters(),
                                                 Config.LR_CRITIC, (0.9, 0.99))
        self.gamma = Config.REWARD_DECAY

    def load_weights(self, net):
        # names in net.state_dict() look like 'layers.1.weight'
        for m in net.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 1)
                nn.init.constant_(m.bias, 0.1)

    def store_trajectory(self, s, a, r):
        self.states.append(s)
        self.actions.append(a)
        self.rewards.append(r)

    def chose_action(self, s):
        s = torch.Tensor(np.expand_dims(s, axis=0))
        if self.is_continues:
            mu, sigma = self.actor_net(s)
            mu, sigma = mu.detach().squeeze(), sigma.detach().squeeze()
            normal_dist = torch.distributions.Normal(mu * 2, sigma + 0.1)
            action = torch.clamp(normal_dist.sample((1, )),
                                 min=-self.actions[0],
                                 max=self.actions[0])
        else:
            # probability of each action
            actions_probs = F.softmax(self.actor_net(s).detach(), dim=1)

            # choose an action according to the probabilities
            action = random.choices(range(actions_probs.size(1)),
                                    weights=actions_probs.squeeze(0))[0]
        return action

    def learn(self, s, a, r, s_):
        s = torch.from_numpy(s).unsqueeze(dim=0).float()
        s_ = torch.from_numpy(s_).unsqueeze(dim=0).float()
        r = torch.tensor(r)
        a = torch.tensor(a).unsqueeze(dim=0)
        V_st = self.critic_net(s).squeeze(dim=0)
        V_st_ = self.critic_net(s_).squeeze(dim=0)
        # td_error = Q(st, at) - V(st)
        # Q(st, at) = r + r*V(st+1)
        # td_error = - V(st) + r + gamma*V(st+1)
        td_error = r + self.gamma * V_st_ - V_st
        loss_critic = td_error**2

        # critic learning step
        self.optimizer_critic.zero_grad()
        loss_critic.backward()
        self.optimizer_critic.step()

        # actor learning step
        mu, sigma = self.actor_net(s)
        mu, sigma = mu.squeeze(), sigma.squeeze()
        normal_dist = torch.distributions.Normal(mu * 2, sigma + 0.1)
        log_prob = normal_dist.log_prob(a)
        loss_actor = -torch.sum(log_prob * td_error.detach())  # maximize log_prob * advantage by minimizing its negative
        self.optimizer_actor.zero_grad()
        loss_actor.backward()
        self.optimizer_actor.step()
        return loss_critic, loss_actor

    def draw_curve(self, loss):
        x = np.arange(1, len(loss) + 1)
        plt.title("cost curve")
        plt.xlabel("train step")
        plt.ylabel("cost")
        plt.plot(x, loss)
        plt.show()
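
A minimal training-loop sketch for this one-step actor-critic in the continuous case. The gym environment id, episode bookkeeping, and conversions are assumptions; chose_action and learn are the methods above.

import gym
import numpy as np

# Hypothetical training loop; Pendulum-v0 is an assumed example env (the id may differ by gym version).
env = gym.make("Pendulum-v0")
agent = Actor_Critic(n_features=env.observation_space.shape[0],
                     actions=env.action_space.high,       # used as the clamp bound in chose_action
                     is_continues=True)

critic_losses = []
for episode in range(100):
    s = env.reset()
    for t in range(200):
        a = agent.chose_action(s)                         # one-element tensor, already clamped
        s_, r, done, _ = env.step(np.array([float(a)]))
        loss_critic, _ = agent.learn(s, float(a), r, s_)
        critic_losses.append(float(loss_critic))
        s = s_
        if done:
            break
agent.draw_curve(critic_losses)
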
Example #12
class Actor_Critic:
    def __init__(self, n_features, action_bounds):
        self.n_features = n_features
        self.action_bounds = action_bounds

        self.eval_actor_net = Actor(n_features, action_bounds)
        self.load_weights(self.eval_actor_net)
        self.eval_actor_net.train()
        self.target_actor_net = Actor(n_features, action_bounds)
        self.target_actor_net.eval()
        self.eval_critic_net = Critic(n_features, action_bounds)
        self.load_weights(self.eval_critic_net)
        self.eval_critic_net.train()
        self.target_critic_net = Critic(n_features, action_bounds)
        self.target_critic_net.eval()

        self.memory = Memory(Config.MEMORY_CAPACITY)
        self.batch_size = Config.BATCH_SIZE
        self.tau = Config.REPLACEMENT_SOFT_TAU

        # we need a good teacher, so the teacher should learn faster than the actor
        self.optimizer_actor = torch.optim.Adam(self.eval_actor_net.parameters(), Config.LR_ACTOR, (0.9, 0.99))
        self.optimizer_critic = torch.optim.Adam(self.eval_critic_net.parameters(), Config.LR_CRITIC, (0.9, 0.99))
        self.gamma = Config.REWARD_DECAY

    def load_weights(self, net):
        # names in net.state_dict() look like 'layers.1.weight'
        for m in net.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 1)
                nn.init.constant_(m.bias, 0.1)

    def store_transition(self, s, a, r, s_):
        self.memory.store([s, a, r, s_])

    def chose_action(self, s):
        s = torch.Tensor(np.expand_dims(s, axis=0))
        action = self.eval_actor_net(s).detach().squeeze(dim=0)
        return action

    def learn(self):
        # for x in self.Actor_target.state_dict().keys():
        #     eval('self.Actor_target.' + x + '.data.mul_((1-TAU))')
        #     eval('self.Actor_target.' + x + '.data.add_(TAU*self.Actor_eval.' + x + '.data)')
        # for x in self.Critic_target.state_dict().keys():
        #     eval('self.Critic_target.' + x + '.data.mul_((1-TAU))')
        #     eval('self.Critic_target.' + x + '.data.add_(TAU*self.Critic_eval.' + x + '.data)')

        # for target_param, param in zip(net_target.parameters(), net.parameters()):
        #     target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)
        for k, v in self.eval_critic_net.state_dict().items():
            self.target_critic_net.state_dict()[k].copy_(self.tau * v + (1-self.tau) * self.target_critic_net.state_dict()[k])
        for k, v in self.eval_actor_net.state_dict().items():
            self.target_actor_net.state_dict()[k].copy_(self.tau * v + (1-self.tau) * self.target_actor_net.state_dict()[k])

        batch_data = self.memory.sample(self.batch_size)
        s0, a0, r1, s1 = zip(*batch_data)
        s0 = torch.tensor(s0, dtype=torch.float)
        a0 = torch.tensor(a0, dtype=torch.float).view(self.batch_size, len(self.action_bounds))
        r1 = torch.tensor(r1, dtype=torch.float).view(self.batch_size, -1)
        s1 = torch.tensor(s1, dtype=torch.float)

        # Input (s, a), output q
        q_s0_a0 = self.eval_critic_net(s0, a0)
        # Input (s_, a_), output q_ for q_target
        # get a_ from the target actor
        a1 = self.target_actor_net(s1).detach()
        q_s1_a1 = self.target_critic_net(s1, a1).detach()
        q_target = r1 + self.gamma * q_s1_a1
        loss_critic = nn.MSELoss()(q_s0_a0, q_target)

        # critic learning step
        # td_error = R + GAMMA * ct(bs_, at(bs_)) - ce(s, ba) is used to update ce;
        # here ae(s) is the stored action ba: pushing the Q predicted by ce toward Q_target makes the evaluation more accurate
        # loss = (Q(st, at) - (rt + gamma*Q'(st+1, u'(st+1))))**2
        self.optimizer_critic.zero_grad()
        loss_critic.backward()
        self.optimizer_critic.step()

        # actor learning step
        # https://zhuanlan.zhihu.com/p/84321382
        actor_a = self.eval_actor_net(s0)
        critic_q = self.eval_critic_net(s0, actor_a)
        # loss = -q = -ce(s, ae(s)) is used to update ae; ae(s) = a, ae(s_) = a_
        # if a is a correct action, its Q value should be closer to 0
        loss_actor = -torch.mean(critic_q)

        self.optimizer_actor.zero_grad()
        loss_actor.backward()
        self.optimizer_actor.step()
        return loss_critic, loss_actor

    def draw_curve(self, loss):
        x = np.arange(1, len(loss)+1)
        plt.title("cost curve")
        plt.xlabel("train step")
        plt.ylabel("cost")
        plt.plot(x, loss)
        plt.show()
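# A minimal, self-contained sketch (not taken from the example above) of the soft target
# update it performs, theta_target <- tau*theta_eval + (1-tau)*theta_target, written in the
# parameters() zip form shown in the commented-out lines. Toy nn.Linear modules stand in
# for the real networks; all names here are illustrative only.
import torch
import torch.nn as nn

eval_net, target_net = nn.Linear(3, 1), nn.Linear(3, 1)
target_net.load_state_dict(eval_net.state_dict())          # start from identical weights
tau = 0.01

with torch.no_grad():
    for target_param, param in zip(target_net.parameters(), eval_net.parameters()):
        target_param.copy_(tau * param + (1.0 - tau) * target_param)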
Example #13
0
class DDPG_Agent():
    def __init__(self, state_size, action_size, num_agents):
        """
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents in the environment
        """
        random_seed = 10
        self.state_size = state_size
        self.action_size = action_size
        random.seed(random_seed)
        self.random_seed = random_seed  # random.seed() returns None, so keep the seed value itself
        self.num_agents = num_agents

        # Replay memory
        self.memory = ReplayBuf(action_size, BUFFER_SIZE, BATCH_SIZE,
                                self.random_seed)

        # Actor Networks
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Make sure the Actor Target Network has the same weight values as the Local Network
        for target, local in zip(self.actor_target.parameters(),
                                 self.actor_local.parameters()):
            target.data.copy_(local.data)

        # Critic Network (w/ Target Network)

        self.critic_local = Critic(state_size * num_agents,
                                   action_size * num_agents,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size * num_agents,
                                    action_size * num_agents,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)
        """
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)
        """

        # Make sure the Critic Target Network has the same weight values as the Local Network
        for target, local in zip(self.critic_target.parameters(),
                                 self.critic_local.parameters()):
            target.data.copy_(local.data)

        self.noise = Ornstein_Uhlenbeck_Noise(action_size, random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""

        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, noise=0.0):

        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if ADD_NOISE:
            action += self.noise.sample() * noise
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        ### Used only for DDPG (use madddpg.maddpg_learn() for MADDPG)
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
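# The Ornstein_Uhlenbeck_Noise class used above is not defined in this snippet; a common
# implementation of the same idea (temporally correlated exploration noise for DDPG) looks
# roughly like the sketch below. The class name and parameter values are illustrative
# defaults, not taken from the example.
import numpy as np

class OUNoiseSketch:
    def __init__(self, size, seed=None, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.default_rng(seed)
        self.reset()

    def reset(self):
        # start each episode from the long-run mean
        self.state = self.mu.copy()

    def sample(self):
        # dx = theta*(mu - x) + sigma*N(0, 1): a discretized OU step with dt folded in
        dx = self.theta * (self.mu - self.state) + self.sigma * self.rng.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state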
Example #14
0
import gym
from Actor import Actor
from Critic import Critic

import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim

device = 'cuda' if torch.cuda.is_available() else 'cpu'

env = gym.make("Pendulum-v0")

actor = Actor(env.observation_space.shape[0], env.action_space.shape[0])
critic = Critic(env.observation_space.shape[0])

criterion = nn.MSELoss().to(device)

Actor_Optimizer = optim.SGD(actor.parameters(), lr=1e-4)
Critic_Optimizer = optim.SGD(critic.parameters(), lr=1e-4)

for episode in range(5):
    state = env.reset()
    returns = np.zeros(1000)
    for step in range(1000):
        pass
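# The loop body above is left as a stub. Below is a minimal, self-contained sketch of one
# way it could be filled in, using tiny inline networks in place of the imported Actor and
# Critic (whose interfaces are not shown here) and assuming the classic gym API the snippet
# uses (Pendulum-v0, env.step returning a 4-tuple). Everything below is illustrative only.
import gym
import torch
import torch.nn as nn
import torch.optim as optim

env = gym.make("Pendulum-v0")
obs_dim, act_dim = env.observation_space.shape[0], env.action_space.shape[0]

policy_mu = nn.Sequential(nn.Linear(obs_dim, 32), nn.Tanh(), nn.Linear(32, act_dim), nn.Tanh())
value_fn = nn.Sequential(nn.Linear(obs_dim, 32), nn.Tanh(), nn.Linear(32, 1))
opt_actor = optim.Adam(policy_mu.parameters(), lr=1e-4)
opt_critic = optim.Adam(value_fn.parameters(), lr=1e-3)
gamma, sigma = 0.9, 0.5

for episode in range(5):
    state = env.reset()
    for step in range(200):
        s = torch.as_tensor(state, dtype=torch.float32)
        mu = policy_mu(s) * 2.0                               # scale tanh output to Pendulum's [-2, 2]
        dist = torch.distributions.Normal(mu, sigma)
        action = dist.sample()
        next_state, reward, done, _ = env.step(action.numpy())

        s_next = torch.as_tensor(next_state, dtype=torch.float32)
        v, v_next = value_fn(s), value_fn(s_next).detach()
        td_error = reward + gamma * v_next - v                # TD(0) error, also used as the advantage

        critic_loss = td_error.pow(2).mean()                  # fit V toward the bootstrap target
        opt_critic.zero_grad()
        critic_loss.backward()
        opt_critic.step()

        actor_loss = (-dist.log_prob(action) * td_error.detach()).mean()
        opt_actor.zero_grad()
        actor_loss.backward()
        opt_actor.step()

        state = next_state
        if done:
            break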