Example 1
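For orientation, the classes in these examples all implement variants of the standard DDPG updates (Lillicrap et al., 2015). Given replay samples (s_i, a_i, r_i, s'_i), target networks Q' and mu', and a soft-update rate tau, the updates are:

\[
y_i = r_i + \gamma\, Q'\!\bigl(s'_i, \mu'(s'_i)\bigr), \qquad
L_{\text{critic}} = \frac{1}{N}\sum_i \bigl(Q(s_i, a_i) - y_i\bigr)^2,
\]
\[
L_{\text{actor}} = -\frac{1}{N}\sum_i Q\bigl(s_i, \mu(s_i)\bigr), \qquad
\theta' \leftarrow \tau\,\theta + (1 - \tau)\,\theta'.
\]

This first example sums rather than averages the actor loss over the batch and draws exploration noise from a pre-computed Ornstein-Uhlenbeck trajectory.
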
class DDPG:
    def __init__(self,
                 gamma,
                 memory,
                 s,
                 a,
                 tau,
                 learningRate=1e-3,
                 criticpath=None,
                 actorpath=None):
        self.gamma = gamma
        self.memory = ReplayMemory(memory)
        self.actor = Actor(state=s, actions=a)
        self.critic = Critic(state=s, actions=a)
        if criticpath is not None:
            self.critic.load_state_dict(torch.load(criticpath))
        if actorpath is not None:
            self.actor.load_state_dict(torch.load(actorpath))
        self.targetActor = Actor(state=s, actions=a)
        self.targetActor.load_state_dict(self.actor.state_dict())
        self.targetCritic = Critic(state=s, actions=a)
        self.targetCritic.load_state_dict(self.critic.state_dict())
        self.tau = tau

        self.actorOptimizer = optim.Adam(self.actor.parameters(), learningRate)
        self.criticOptimizer = optim.Adam(self.critic.parameters(),
                                          learningRate)
        # state and action dimensionalities, used to size tensors below
        self.state = s
        self.action = a
        # pre-generated Ornstein-Uhlenbeck noise, one row per environment step
        self.OUarray = np.zeros((1000, self.action), dtype="f")
        self.step = 0

    def processNoise(self):
        # Crude exploration noise: each action dimension is kept positive,
        # zeroed, or negated with probability 1/3 each.
        ret = torch.rand(self.action)
        for i in range(self.action):
            r = random.random()
            if r <= .33:
                pass  # keep the positive sample
            elif r <= .66:
                ret[i] = 0
            else:
                ret[i] = -ret[i]
        return ret

    def OUprocess(self, sigma, theta, mu):
        # Pre-compute an Ornstein-Uhlenbeck noise trajectory for every action
        # dimension with an Euler-Maruyama scheme:
        #   y[i] = y[i-1] + theta*(mu - y[i-1])*dt + sigma*sqrt(dt)*N(0, 1)
        t_0 = 0
        t_end = 10
        length = 1000

        y = np.zeros((length, self.action), dtype="f")
        t = np.linspace(t_0, t_end, length)  # time axis
        dt = np.mean(np.diff(t))
        drift = lambda y, t: theta * (mu - y)  # drift term
        diffusion = lambda y, t: sigma  # diffusion term

        # solve the SDE, one independent trajectory per action dimension
        for j in range(self.action):
            y[0][j] = np.random.normal(loc=0.0, scale=1.0)  # initial condition
            noise = np.random.normal(loc=0.0, scale=1.0,
                                     size=length) * np.sqrt(dt)  # Wiener increments
            for i in range(1, length):
                y[i][j] = (y[i - 1][j]
                           + drift(y[i - 1][j], i * dt) * dt
                           + diffusion(y[i - 1][j], i * dt) * noise[i])
        self.OUarray = y

    def selectAction(self, state):
        # state is expected to be a FloatTensor; it is wrapped in a Variable here
        ret = self.targetActor(Variable(state)).data
        # add the pre-computed OU noise for the current step (wrap around if needed)
        ret = ret + torch.from_numpy(self.OUarray[self.step % len(self.OUarray)])
        self.step += 1
        return torch.clamp(ret, 0.0, 1.0)

    def addToMemory(self, state, action, reward, stateprime):
        self.memory.push(state, action, reward, stateprime)

    def primedToLearn(self):
        return self.memory.isFull()

    def PerformUpdate(self, batchsize):
        # Actor criterion follows https://github.com/vy007vikas/PyTorch-ActorCriticRL :
        # the actor loss is sum(-Q(s, a)) taken over the batch.
        self.actorOptimizer.zero_grad()
        self.criticOptimizer.zero_grad()

        batch = self.memory.batch(batchsize)
        Q = torch.zeros(len(batch), self.state + self.action)
        Qprime = torch.zeros(len(batch), self.state + self.action)
        rewards = torch.zeros(len(batch), 1)
        # Build (state, action) inputs for the critic and
        # (next state, target action) inputs for the target critic.
        for i, sample in enumerate(batch):
            Q[i, :] = torch.cat((sample['s'], sample['a']))
            transition = self.targetActor(
                Variable(sample['sprime'], volatile=True)).data
            Qprime[i, :] = torch.cat((sample['sprime'], transition), dim=0)
            rewards[i, 0] = sample['r'][0]

        # Critic update: regress Q(s, a) toward r + gamma * Q'(s', mu'(s'))
        Qprime = self.gamma * self.targetCritic(
            Variable(Qprime)).data + rewards
        Qprime = Variable(Qprime)
        Q = self.critic(Variable(Q))
        criterion = torch.nn.MSELoss()
        loss = criterion(Q, Qprime)
        loss.backward()
        self.criticOptimizer.step()

        # Actor update: maximize Q(s, mu(s)) by minimizing its negation
        self.actorOptimizer.zero_grad()
        S = torch.zeros(len(batch), self.state)
        for i, sample in enumerate(batch):
            S[i, :] = sample['s']
        A = self.actor(Variable(S))
        loss = -1 * torch.sum(self.critic(torch.cat((Variable(S), A), dim=1)))
        loss.backward()
        self.actorOptimizer.step()

    def UpdateTargetNetworks(self):
        # Soft (Polyak) update: target = (1 - tau) * target + tau * online
        criticDict = self.critic.state_dict()
        tCriticDict = self.targetCritic.state_dict()
        for param in criticDict.keys():
            tCriticDict[param] = tCriticDict[param] * (
                1 - self.tau) + criticDict[param] * self.tau

        actorDict = self.actor.state_dict()
        tActorDict = self.targetActor.state_dict()
        for param in actorDict.keys():
            tActorDict[param] = tActorDict[param] * (
                1 - self.tau) + actorDict[param] * self.tau

        self.targetCritic.load_state_dict(tCriticDict)
        self.targetActor.load_state_dict(tActorDict)

    def saveActorCritic(self):
        torch.save(self.critic.state_dict(), './critic')
        torch.save(self.actor.state_dict(), './actor')
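A minimal sketch of how this class might be driven, assuming an older Gym-style environment and that ReplayMemory, Actor, and Critic come from the surrounding project; env, state_dim, action_dim, and the episode/step counts below are illustrative placeholders, not part of the original code.

agent = DDPG(gamma=0.99, memory=10000, s=state_dim, a=action_dim, tau=0.001)

for episode in range(num_episodes):
    state = torch.from_numpy(env.reset()).float()
    agent.OUprocess(sigma=0.2, theta=0.15, mu=0.0)  # fresh 1000-step noise trajectory
    agent.step = 0
    for t in range(1000):
        action = agent.selectAction(state)
        next_obs, reward, done, _ = env.step(action.numpy())
        next_state = torch.from_numpy(next_obs).float()
        agent.addToMemory(state, action, torch.FloatTensor([reward]), next_state)
        if agent.primedToLearn():
            agent.PerformUpdate(batchsize=64)
            agent.UpdateTargetNetworks()
        state = next_state
        if done:
            break

agent.saveActorCritic()
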
Example 2

class Agent:
    def __init__(self, params):
        self.action_size = params['action_size']
        self.state_size = params['state_size']
        self.num_agents = params['num_agents']
        self.buffer_size = params['buffer_size']
        self.batch_size = params['batch_size']
        self.__gamma = params['gamma']
        self.__tau = params['tau']
        self.__update_every = params['update_every']
        self.__save_to = params['save_to']
        self.__memory = ReplayBuffer(self.buffer_size, self.batch_size)
        self.__lr = params['lr']
        self.noise_type = params['noise_type']

        actor_params = dict()
        actor_params['arch_params_actor'] = params['arch_params_actor']
        actor_params['action_size'] = self.action_size
        actor_params['state_size'] = self.state_size
        actor_params['eps'] = params['eps']
        actor_params['eps_decay'] = params['eps_decay']
        actor_params['eps_min'] = params['min_eps']
        actor_params['noise_type'] = params['noise_type']
        self.actor = Actor(actor_params)
        self.actor_target = Actor(actor_params)
        self.optimizer_actor = optim.Adam(self.actor.parameters(),
                                          lr=self.__lr)
        self.scheduler_actor = optim.lr_scheduler.StepLR(self.optimizer_actor,
                                                         step_size=100,
                                                         gamma=0.95)

        critic_params = dict()
        critic_params['arch_params_critic'] = params['arch_params_critic']
        critic_params['action_size'] = self.action_size
        critic_params['state_size'] = self.state_size
        self.critic = Critic(critic_params)
        self.critic_target = Critic(critic_params)
        self.optimizer_critic = optim.Adam(self.critic.parameters(),
                                           lr=self.__lr)
        self.scheduler_critic = optim.lr_scheduler.StepLR(self.optimizer_critic,
                                                          step_size=100,
                                                          gamma=0.95)
        self.__t = 0

    def memorize_experience(self, state, action, reward, next_state, done):
        self.__memory.add(state, action.detach(), reward, next_state, done)
        self.__t = (self.__t + 1)

    def learn_from_past_experiences(self):
        if self.__t % self.__update_every == 0:
            if len(self.__memory) > self.batch_size:
                experiences = self.__memory.sample()
                self.update_actor_critic(experiences)

    def choose_action(self, state):
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        # cast to float32 so the tensor matches the networks' parameter dtype
        state = torch.from_numpy(state.astype(np.float32)).to(device)
        action, action_perturbed = self.actor(state)
        return action, action_perturbed

    def update_actor_critic(self, experiences):
        states, actions, rewards, next_states, dones = experiences
        next_actions, next_actions_perturbed = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, next_actions)
        Q_targets = rewards + (self.__gamma * Q_targets_next * (1 - dones)
                               )  # if done == True: second term is equal to 0
        Q_expected = self.critic(states, actions)
        loss_func = nn.MSELoss()
        loss_critic = loss_func(Q_expected, Q_targets.detach())

        self.optimizer_critic.zero_grad()
        loss_critic.backward()
        # self.scheduler_critic.step()
        self.optimizer_critic.step()

        predicted_actions, predicted_actions_perturbed = self.actor(
            states)  # new predicted actions, not the ones stored in buffer

        if self.noise_type == 'parameter':
            # if the mean squared distance between predicted_actions and
            # predicted_actions_perturbed is too big (>= 0.15), shrink the noise
            # scale; otherwise grow it
            if (predicted_actions -
                    predicted_actions_perturbed).pow(2).mean() >= 0.15:
                self.actor.eps /= 1.01
                self.actor_target.eps /= 1.01
            else:
                self.actor.eps *= 1.01
                self.actor_target.eps *= 1.01

        loss_actor = -self.critic(states, predicted_actions).mean()

        self.optimizer_actor.zero_grad()
        loss_actor.backward()
        # self.scheduler_actor.step()
        self.optimizer_actor.step()

        self.soft_update(self.critic, self.critic_target)
        self.soft_update(self.actor, self.actor_target)

    def update_eps(self):
        self.actor.eps = max(self.actor.eps * self.actor.eps_decay,
                             self.actor.eps_min)
        self.actor_target.eps = max(
            self.actor_target.eps * self.actor_target.eps_decay,
            self.actor_target.eps_min)

    def soft_update(self, local_model, target_model):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.__tau * local_param.data +
                                    (1.0 - self.__tau) * target_param.data)

    def save_weights(self, save_to):
        actor_params_and_state_dict = {
            'actor_params': self.actor.actor_params,
            'state_dict': self.actor.state_dict()
        }
        critic_params_and_state_dict = {
            'critic_params': self.critic.critic_params,
            'state_dict': self.critic.state_dict()
        }

        checkpoint = dict()
        checkpoint['critic_params_and_state_dict'] = critic_params_and_state_dict
        checkpoint['actor_params_and_state_dict'] = actor_params_and_state_dict
        torch.save(checkpoint, save_to)

    def load_weights(self, load_from):
        checkpoint = torch.load(load_from)

        critic_params_and_state_dict = checkpoint[
            'critic_params_and_state_dict']
        actor_params_and_state_dict = checkpoint['actor_params_and_state_dict']

        self.actor = Actor(actor_params_and_state_dict['actor_params'])
        self.actor.load_state_dict(actor_params_and_state_dict['state_dict'])

        self.critic = Critic(critic_params_and_state_dict['critic_params'])
        self.critic.load_state_dict(critic_params_and_state_dict['state_dict'])
        return self
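A sketch of how this Agent could be wired into a training loop. The params keys mirror the constructor above; arch_params_actor, arch_params_critic, env, and max_steps are project-specific placeholders, and the buffer, Actor, and Critic classes are assumed to behave as the methods above expect.

params = {
    'action_size': 4, 'state_size': 33, 'num_agents': 1,
    'buffer_size': int(1e5), 'batch_size': 128,
    'gamma': 0.99, 'tau': 1e-3, 'update_every': 1, 'lr': 1e-4,
    'save_to': 'checkpoint.pth', 'noise_type': 'parameter',
    'eps': 0.3, 'eps_decay': 0.999, 'min_eps': 0.01,
    'arch_params_actor': arch_params_actor,    # network spec, project-specific
    'arch_params_critic': arch_params_critic,  # network spec, project-specific
}
agent = Agent(params)

state = env.reset()
for t in range(max_steps):
    action, action_perturbed = agent.choose_action(state)
    next_state, reward, done, _ = env.step(action_perturbed.detach().cpu().numpy())
    agent.memorize_experience(state, action_perturbed, reward, next_state, done)
    agent.learn_from_past_experiences()
    state = next_state
    if done:
        break

agent.update_eps()
agent.save_weights(params['save_to'])
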
Example 3
class AsyncDDPG(object):
    def __init__(self,
                 gamma,
                 s,
                 a,
                 learningRate=1e-3,
                 criticpath=None,
                 actorpath=None):
        self.gamma = gamma
        self.actor = Actor(state=s, actions=a, hidden1=180, hidden2=87)
        self.critic = Critic(state=s, actions=a, hidden1=250, hidden2=100)
        if criticpath is not None:
            self.critic.load_state_dict(torch.load(criticpath))
        if actorpath is not None:
            self.actor.load_state_dict(torch.load(actorpath))

        self.actorOptimizer = optim.Adam(self.actor.parameters(), learningRate)
        self.criticOptimizer = optim.Adam(self.critic.parameters(),
                                          learningRate)
        #more a dimensionality thing
        self.state = s
        self.action = a
        self.count = 0

    def PerformUpdate(self, batchsize, target):
        # Actor criterion follows https://github.com/vy007vikas/PyTorch-ActorCriticRL :
        # the actor loss is sum(-Q(s, a)) taken over the batch.
        self.actorOptimizer.zero_grad()
        self.criticOptimizer.zero_grad()

        batch = target.getBatchMemory(batchsize)

        Q = torch.zeros(len(batch), self.state + self.action)
        Qprime = torch.zeros(len(batch), self.state + self.action)
        rewards = torch.zeros(len(batch), 1)

        # Build (state, action) inputs for the critic and
        # (next state, target action) inputs for the target critic.
        for i, sample in enumerate(batch):
            Q[i, :] = torch.cat((sample['s'], sample['a']))
            transition = target.targetActor(
                Variable(sample['sprime'], volatile=True)).data
            Qprime[i, :] = torch.cat((sample['sprime'], transition), dim=0)
            rewards[i, 0] = sample['r'][0]

        # Critic update: regress Q(s, a) toward r + gamma * Q'(s', mu'(s'))
        Qprime = self.gamma * target.targetCritic(
            Variable(Qprime)).data + rewards
        Qprime = Variable(Qprime)

        Q = self.critic(Variable(Q))
        criterion = torch.nn.MSELoss()
        loss = criterion(Q, Qprime)
        loss.backward()
        self.criticOptimizer.step()

        # Actor update: maximize Q(s, mu(s)) by minimizing its negation
        self.actorOptimizer.zero_grad()
        S = torch.zeros(len(batch), self.state)
        for i, sample in enumerate(batch):
            S[i, :] = sample['s']
        A = self.actor(Variable(S))
        loss = -1 * torch.sum(self.critic(torch.cat((Variable(S), A), dim=1)))
        loss.backward()
        self.actorOptimizer.step()

    def getActor(self):
        return self.actor

    def getCritic(self):
        return self.critic

    def ProduceTargetActorCritic(self, memory=2000, tau=.25, epsilon=.5):
        print(self.count)  # debug: number of target actor-critics produced so far
        self.count += 1
        s = self.state
        a = self.action
        return TargetActorCritic(self.actor,
                                 self.critic,
                                 memory,
                                 s,
                                 a,
                                 tau,
                                 epsilon=epsilon)

    def saveActorCritic(self):
        torch.save(self.critic.state_dict(), './AsyncCritic')
        torch.save(self.actor.state_dict(), './AsyncActor')
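A rough sketch of the intended asynchronous usage, under the assumption that TargetActorCritic (not shown here) exposes the getBatchMemory, targetActor, and targetCritic members referenced in PerformUpdate; the worker rollout code that fills each target's replay memory is omitted, and state_dim/action_dim are placeholders.

learner = AsyncDDPG(gamma=0.99, s=state_dim, a=action_dim)

# One target actor-critic, each with its own replay memory, per worker.
workers = [learner.ProduceTargetActorCritic(memory=2000, tau=0.25) for _ in range(4)]

# ... workers collect experience into their own memories ...

for target in workers:
    learner.PerformUpdate(batchsize=64, target=target)
learner.saveActorCritic()
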
Example 4
class ActorCritic:
	def __init__(self, state_dim, action_dim, memory, load):
		self.memory = memory
		self.noise = OrnsteinUhlenbeckActionNoise(action_dim)

		self.actor = Actor(state_dim, action_dim)
		self.critic = Critic(state_dim, action_dim)
		self.target_actor = Actor(state_dim, action_dim)
		self.target_critic = Critic(state_dim, action_dim)

		self.critic.cuda()
		self.actor.cuda()
		self.target_critic.cuda()
		self.target_actor.cuda()

		self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),LEARNING_RATE)
		self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),LEARNING_RATE)

		self.loss_funct = nn.SmoothL1Loss()
		if load != 0:
			self.load_models(load) #load the model

# Target and trained networks are the same when initializing
		self.net_update(self.target_actor, self.actor, True)
		self.net_update(self.target_critic, self.critic, True)

# Predict an action with or without noise depending on the "train" flag
	def get_action(self, state, train):
		state = Variable(torch.from_numpy(np.float32(state)).type(torch.cuda.FloatTensor))
		action = self.actor.forward(state).detach().cpu().numpy()
		if train:
			noise = np.float32(self.noise.sample())
			return action + noise
		return action

# Run the optimization:
#	Get predicted action from the next state by Target Actor
#	Base on that predict the Value of that action by Target Critic
#	Use the predicted value to update Critic, and then Actor
#	Soft update target networks to mirror the progress
	def optimize(self):
		state,action,reward,next_state = self.memory.sample(BATCH_SIZE)

		state = Variable(torch.from_numpy(np.float32(state)).type(torch.cuda.FloatTensor))
		action = Variable(torch.from_numpy(np.float32(action)).type(torch.cuda.FloatTensor))
		reward = Variable(torch.from_numpy(np.float32(reward)).type(torch.cuda.FloatTensor))
		next_state = Variable(torch.from_numpy(np.float32(next_state)).type(torch.cuda.FloatTensor))

		next_action = self.target_actor.forward(next_state).detach()
		target = reward + GAMMA*torch.squeeze(self.target_critic.forward(next_state, next_action).detach())

		prediction = torch.squeeze(self.critic.forward(state, action))

		loss_critic = self.loss_funct(prediction, target)
		self.critic_optimizer.zero_grad()
		loss_critic.backward()
		self.critic_optimizer.step()

		action = self.actor.forward(state)
		loss_actor = -1*torch.sum(self.critic.forward(state, action))
		self.actor_optimizer.zero_grad()
		loss_actor.backward()
		self.actor_optimizer.step()

		self.net_update(self.target_actor, self.actor, False)
		self.net_update(self.target_critic, self.critic, False)

	# Apply soft or hard update on the network
	def net_update(self,target, source, hard):
		degree = 1
		if not hard: degree = TAU
		for target_param, param in zip(target.parameters(), source.parameters()):
			target_param.data.copy_(target_param.data * (1.0 - degree) + param.data * degree)

# Store the models
	def save_models(self, episode):
		torch.save(self.target_actor.state_dict(), 'Models/' + str(episode) + '_actor.pt')
		torch.save(self.target_critic.state_dict(), 'Models/' + str(episode) + '_critic.pt')
		
# Load the models
	def load_models(self, episode):
		self.actor.load_state_dict(torch.load('Models/' + str(episode) + '_actor.pt'))
		self.critic.load_state_dict(torch.load('Models/' + str(episode) + '_critic.pt'))
		self.net_update(self.target_actor, self.actor, True)
		self.net_update(self.target_critic, self.critic, True)
		print('Models loaded successfully')
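A minimal driver sketch for this class. It assumes CUDA is available (the constructor calls .cuda() unconditionally), that BATCH_SIZE, LEARNING_RATE, GAMMA, and TAU are defined at module level as the code above requires, and that the replay buffer offers an add method and len(); the buffer class, env, state_dim, action_dim, and the loop bounds are placeholders.

memory = MemoryBuffer(1000000)  # placeholder replay buffer
trainer = ActorCritic(state_dim, action_dim, memory, load=0)  # load=0 -> start from scratch

for episode in range(num_episodes):
    state = env.reset()
    for t in range(max_steps):
        action = trainer.get_action(state, train=True)  # exploration noise added
        next_state, reward, done, _ = env.step(action)
        memory.add(state, action, reward, next_state)   # assumed buffer API
        if len(memory) > BATCH_SIZE:
            trainer.optimize()
        state = next_state
        if done:
            break
    if episode % 100 == 0:
        trainer.save_models(episode)
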
Example 5
class Actor_Critic:
    def __init__(self, n_features, action_bounds):
        self.n_features = n_features
        self.action_bounds = action_bounds

        self.eval_actor_net = Actor(n_features, action_bounds)
        self.load_weights(self.eval_actor_net)
        self.eval_actor_net.train()
        self.target_actor_net = Actor(n_features, action_bounds)
        self.target_actor_net.eval()
        self.eval_critic_net = Critic(n_features, action_bounds)
        self.load_weights(self.eval_critic_net)
        self.eval_critic_net.train()
        self.target_critic_net = Critic(n_features, action_bounds)
        self.target_critic_net.eval()

        self.memory = Memory(Config.MEMORY_CAPACITY)
        self.batch_size = Config.BATCH_SIZE
        self.tau = Config.REPLACEMENT_SOFT_TAU

        # we need a good teacher: the critic should learn faster than the actor
        self.optimizer_actor = torch.optim.Adam(self.eval_actor_net.parameters(), Config.LR_ACTOR, (0.9, 0.99))
        self.optimizer_critic = torch.optim.Adam(self.eval_critic_net.parameters(), Config.LR_CRITIC, (0.9, 0.99))
        self.gamma = Config.REWARD_DECAY

    def load_weights(self, net):
        # despite the name, this randomly re-initializes every Linear layer;
        # the keys in net.state_dict() look like 'layers.1.weight'
        for m in net.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 1)
                nn.init.constant_(m.bias, 0.1)

    def store_transition(self, s, a, r, s_):
        self.memory.store([s, a, r, s_])

    def chose_action(self, s):
        s = torch.Tensor(np.expand_dims(s, axis=0))
        action = self.eval_actor_net(s).detach().squeeze(dim=0)
        return action

    def learn(self):
        # for x in self.Actor_target.state_dict().keys():
        #     eval('self.Actor_target.' + x + '.data.mul_((1-TAU))')
        #     eval('self.Actor_target.' + x + '.data.add_(TAU*self.Actor_eval.' + x + '.data)')
        # for x in self.Critic_target.state_dict().keys():
        #     eval('self.Critic_target.' + x + '.data.mul_((1-TAU))')
        #     eval('self.Critic_target.' + x + '.data.add_(TAU*self.Critic_eval.' + x + '.data)')

        # for target_param, param in zip(net_target.parameters(), net.parameters()):
        #     target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)
        # soft-update the target networks: target = tau * eval + (1 - tau) * target
        for k, v in self.eval_critic_net.state_dict().items():
            self.target_critic_net.state_dict()[k].copy_(self.tau * v + (1-self.tau) * self.target_critic_net.state_dict()[k])
        for k, v in self.eval_actor_net.state_dict().items():
            self.target_actor_net.state_dict()[k].copy_(self.tau * v + (1-self.tau) * self.target_actor_net.state_dict()[k])

        batch_data = self.memory.sample(self.batch_size)
        s0, a0, r1, s1 = zip(*batch_data)
        s0 = torch.tensor(s0, dtype=torch.float)
        a0 = torch.tensor(a0, dtype=torch.float).view(self.batch_size, len(self.action_bounds))
        r1 = torch.tensor(r1, dtype=torch.float).view(self.batch_size, -1)
        s1 = torch.tensor(s1, dtype=torch.float)

        # Input (s, a), output q
        q_s0_a0 = self.eval_critic_net(s0, a0)
        # Input (s_, a_), output q_ for q_target
        # get a_ from the target actor
        a1 = self.target_actor_net(s1).detach()
        q_s1_a1 = self.target_critic_net(s1, a1).detach()
        q_target = r1 + self.gamma * q_s1_a1
        loss_critic = nn.MSELoss()(q_s0_a0, q_target)

        # critic learning step:
        # td_error = R + GAMMA * Ct(s_, At(s_)) - Ce(s, a); this updates Ce.
        # Here a is the action stored in memory; pulling Ce's Q estimate toward
        # Q_target makes the evaluation more accurate.
        # loss = (Q(st, at) - (rt + gamma * Q'(st+1, u'(st+1))))**2
        self.optimizer_critic.zero_grad()
        loss_critic.backward()
        self.optimizer_critic.step()

        # actor learning step
        # https://zhuanlan.zhihu.com/p/84321382
        actor_a = self.eval_actor_net(s0)
        critic_q = self.eval_critic_net(s0, actor_a)
        # loss = -q = -Ce(s, Ae(s)); this updates Ae.  Ae(s) = a, Ae(s_) = a_
        # if a is a good action, then its Q value should be closer to 0
        loss_actor = -torch.mean(critic_q)

        self.optimizer_actor.zero_grad()
        loss_actor.backward()
        self.optimizer_actor.step()
        return loss_critic, loss_actor

    def draw_curve(self, loss):
        x = np.arange(1, len(loss)+1)
        plt.title("cost curve")
        plt.xlabel("train step")
        plt.ylabel("cost")
        plt.plot(x, loss)
        plt.show()
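
A sketch of the training loop this last class appears to expect, assuming a Gym-style continuous-control env (a placeholder) and the Config constants referenced above; whether the replay memory is warm enough to sample is gated here on an assumed is_full() helper, so substitute the project's own condition.

agent = Actor_Critic(n_features=env.observation_space.shape[0],
                     action_bounds=env.action_space.high)
critic_losses = []

for episode in range(num_episodes):
    s = env.reset()
    done = False
    while not done:
        a = agent.chose_action(s).numpy()
        s_, r, done, _ = env.step(a)
        agent.store_transition(s, a, r, s_)
        if agent.memory.is_full():  # assumed Memory helper; learn only once the buffer is warm
            loss_critic, loss_actor = agent.learn()
            critic_losses.append(loss_critic.item())
        s = s_

agent.draw_curve(critic_losses)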