Example #1
def main():
    policy_net = DQN(U_num, num_actions).to(device)  # initialize the Q-network
    policy_net.apply(init_weights)
    if pretrained:
        ckp = torch.load('/data2/jiangjigang/ckp/dqn.pth')
        policy_net.load_state_dict(
            {k.replace('module.', ''): v
             for k, v in ckp.items()})
    target_net = DQN(U_num, num_actions).to(device)  # initialize the target Q-network
    target_net.load_state_dict(policy_net.state_dict())  # initialize the target network with the Q-network's parameters
    target_net.eval()
    optimizer_policy = torch.optim.Adam(policy_net.parameters(),
                                        lr=learning_rate)  # Adam optimizer; can be swapped for another
    buffer = ReplayBuffer(buffer_size)  # replay buffer: stores experience tuples that are later sampled at random to update the networks (see Buffer.py)
    criterion = torch.nn.MSELoss(reduction='sum')

    # training
    for i_episode in range(num_episodes):

        state0 = [user_loc, user_dis, node_loc, use_buff]  # build the initial state
        error = 0.0
        all_reward = 0
        for t in count():
            # select an action
            action = e_greedy_select_action(state0, policy_net)
            a = np.array([action.data.cpu().numpy()])
            #print("action selected by e_greedy is {}".format(action))
            # use the transition function to get the next state and whether it is terminal
            state1, done, flag = transition_function(state0, action)
            # use the reward function to compute the reward for this step
            reward, cost_migration = reward_function(state0, action, state1,
                                                     flag)
            all_reward = all_reward + reward
            # store the experience tuple in the buffer
            buffer.add(state0, a, reward, state1, done)

            # exit an episode after MAX_T steps
            if t > MAX_T:
                break

            # start updating the network parameters only after a few warm-up episodes, so the replay buffer holds enough data for stable training
            if i_episode > 1:

                # sample a training batch of BATCH_SIZE experiences from the buffer
                batch = buffer.getBatch(BATCH_SIZE)

                policy_net, target_net, bellman_error = optimize_model(
                    batch, policy_net, target_net, optimizer_policy, criterion)
                error = error + bellman_error.data.cpu().numpy()
            # move to the next state
            state0 = state1
        ave_error = error / (t * 1.00)
        ave_reward = all_reward / (t * 1.00)
        print(ave_error, ave_reward)
    torch.save(policy_net.state_dict(), '/data2/jiangjigang/ckp/dqn.pth')
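
The loop above relies on an e_greedy_select_action helper that is not shown in this snippet. A minimal sketch of what such an epsilon-greedy selector could look like, assuming globals epsilon-style exploration rate, num_actions, and device, and a state that can be flattened into a single feature vector (the exact state encoding is an assumption):

import random
import numpy as np
import torch

def e_greedy_select_action(state, policy_net, epsilon=0.1):
    # Hypothetical sketch; the real helper and its state encoding are not part of the snippet above.
    if random.random() < epsilon:
        # explore: pick a random action index
        return torch.tensor(np.random.randint(num_actions), device=device)
    # exploit: pick the action with the highest predicted Q-value
    with torch.no_grad():
        state_tensor = torch.as_tensor(np.hstack(state), dtype=torch.float32,
                                       device=device).unsqueeze(0)
        return policy_net(state_tensor).argmax(dim=1).squeeze()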
Example #2
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Reward monitoring
        self.best_total_reward = -np.inf

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

    def reset_episode(self):
        self.total_reward = 0.0
        #self.count = 0
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        self.total_reward += reward

        if self.total_reward > self.best_total_reward:
            self.best_total_reward = self.total_reward

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
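
A minimal sketch of how this DDPG agent might be driven, assuming a Udacity-style task whose step() returns (next_state, reward, done) and an externally defined num_episodes (both are assumptions, not part of the class above):

# hypothetical driver loop
agent = DDPG(task)
for i_episode in range(num_episodes):
    state = agent.reset_episode()  # resets the task and the episode reward
    done = False
    while not done:
        action = agent.act(state)                     # noisy action from the local actor
        next_state, reward, done = task.step(action)  # environment transition (assumed interface)
        agent.step(action, reward, next_state, done)  # store experience and learn when possible
        state = next_state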
Example #3
# training
for i_episode in range(num_episodes):

    #state0  # build the initial state

    for t in count():
        # select an action
        action = e_greedy_select_action(state0)
        print("action selected by e_greedy is {}".format(action))
        # use the transition function to get the next state and whether it is terminal
        state1, done, flag = transition_function(state0, action)
        # use the reward function to compute the reward for this step
        reward, cost_migration = reward_function(state0, action, state1, flag)
        # store the experience tuple in the buffer
        buffer.add(state0, action, reward, state1, done)

        # exit an episode after MAX_T steps
        if t > MAX_T:
            break

        # update the network parameters only when episode > 10, so the replay buffer holds enough data for stable training
        if i_episode > 10:

            # sample a training batch of BATCH_SIZE experiences from the buffer
            batch = buffer.getBatch(BATCH_SIZE)

            policy_net, target_net = optimize_model(batch, policy_net,
                                                    target_net,
                                                    optimizer_policy,
                                                    criterion)
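
optimize_model is not defined in these snippets. A rough sketch of the standard DQN update it presumably performs, assuming the batch unpacks into (states, actions, rewards, next_states, dones) tensors with actions shaped (batch, 1) and a discount factor GAMMA (all of these names are assumptions; Example #1 additionally unpacks the returned Bellman error):

def optimize_model(batch, policy_net, target_net, optimizer_policy, criterion):
    # Hypothetical sketch of a vanilla DQN update; the real function lives elsewhere.
    states, actions, rewards, next_states, dones = batch
    # Q(s, a) from the policy network for the actions actually taken
    q_values = policy_net(states).gather(1, actions.long())
    # bootstrap target: r + gamma * max_a' Q_target(s', a') for non-terminal states
    with torch.no_grad():
        next_q = target_net(next_states).max(dim=1, keepdim=True)[0]
        targets = rewards + GAMMA * next_q * (1 - dones)
    bellman_error = criterion(q_values, targets)
    optimizer_policy.zero_grad()
    bellman_error.backward()
    optimizer_policy.step()
    return policy_net, target_net, bellman_error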
Example #4
class maddpg():
    """Interacts with and learns from the environment."""
    def __init__(self, env, config):
        """Initialize an Agent object.
        
        Params
        ======
            env : environment to be handled
            config : configuration dict providing a variety of parameters
        """

        self.env = env
        self.config = config
        # self.seed = (config['seed'])

        # set parameter for ML
        self.set_parameters(config)
        # Replay memory
        self.memory = ReplayBuffer(config)
        # Q-Network
        self.create_agents(config)
        # load agent
        if self.load_model:
            self.load_agent('trained_tennis_2k86.pth')

    def set_parameters(self, config):
        # Base agent parameters
        self.gamma = config['gamma']  # discount factor
        self.tau = config['tau']
        self.max_episodes = config[
            'max_episodes']  # max number of episodes to train
        self.env_file_name = config[
            'env_file_name']  # name and path for env app
        self.brain_name = config[
            'brain_name']  # name for env brain used in step
        self.train_mode = config['train_mode']
        self.load_model = config['load_model']
        self.save_model = config['save_model']
        self.num_agents = config['num_agents']
        self.state_size = config['state_size']
        self.action_size = config['action_size']
        self.hidden_size = config['hidden_size']
        self.buffer_size = config['buffer_size']
        self.batch_size = config['batch_size']
        self.learn_every = config['learn_every']
        self.learn_num = config['learn_num']
        self.critic_learning_rate = config['critic_learning_rate']
        self.actor_learning_rate = config['actor_learning_rate']
        self.noise_decay = config['noise_decay']
        self.seed = (config['seed'])
        torch.manual_seed(self.seed)
        np.random.seed(self.seed)
        random.seed(self.seed)
        self.noise_scale = 1
        self.results = struct_class()
        # Some Debug flags
        self.debug_show_memory_summary = False

    def create_agents(self, config):
        self.maddpg_agent = [ddpg_agent(config), ddpg_agent(config)]

        for a_i in range(self.num_agents):
            self.maddpg_agent[a_i].id = a_i

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        # print('Step adding types') # : ,states.shape, actions.shape, rewards.shape, next_states.shape, dones.shape)
        actions = np.reshape(actions, (1, 2 * self.action_size))
        self.memory.add(states, actions, rewards, next_states, dones)

    def act(self, state):
        """Returns actions for given state as per current policy 
        shuold only get single or single joined states from train"""
        state = ten(state)
        actions = np.vstack([agent.act(state) for agent in self.maddpg_agent])
        return actions

    def actor_target(self, states):
        """Returns actions for given state as per current target_policy without noise.
           should only get batch_size states from learn"""
        actions = np.hstack([agent.act(states) for agent in self.maddpg_agent])
        return ten(actions)

    def init_results(self):
        """ Keeping different results in list in self.results, being initializd here"""
        self.results.reward_window = deque(maxlen=100)
        self.results.all_rewards = []
        self.results.avg_rewards = []
        self.results.critic_loss = []
        self.results.actor_loss = []

    def episode_reset(self, i_episode):
        self.noise_reset()
        self.episode = i_episode
        self.noise_scale *= self.noise_decay
        for agent in self.maddpg_agent:
            agent.noise_scale = self.noise_scale
            agent.episode = self.episode

    def noise_reset(self):
        for agent in self.maddpg_agent:
            agent.noise.reset()

    def train(self):
        print('Running on device : ', device)
        # if False:
        #     filename = 'trained_reacher_a_e100.pth'
        #     self.load_agent(filename)
        self.init_results()
        # training loop
        # show progressbar
        widget = [
            'episode: ',
            pb.Counter(), '/',
            str(self.max_episodes), ' ',
            pb.Percentage(), ' ',
            pb.ETA(), ' ',
            pb.Bar(marker=pb.RotatingMarker()), ' '
        ]

        timer = pb.ProgressBar(widgets=widget,
                               maxval=self.max_episodes).start()

        for i_episode in range(self.max_episodes):
            timer.update(i_episode)
            tic = time.time()

            # per episode resets
            self.episode_reset(i_episode)
            total_reward = np.zeros(self.num_agents)
            # Reset the environment
            env_info = self.env.reset(
                train_mode=self.train_mode)[self.brain_name]
            states = self.get_states(env_info)
            t = 0
            dones = np.zeros(self.num_agents, dtype=bool)

            # loop over episode time steps
            while not any(dones):
                # act and collect data
                actions = self.act(states)
                env_info = self.env.step(actions)[self.brain_name]
                next_states = self.get_states(env_info)
                rewards = env_info.rewards
                dones = env_info.local_done
                # increment stuff
                t += 1
                total_reward += rewards
                # np.set_printoptions(formatter={'float': '{: 0.3f}'.format})
                # print('Episode {} step {} taken action {} reward {} and done is {}'.format(i_episode,t,actions,rewards,dones))
                # Proceed agent step
                self.step(states, actions, rewards, next_states, dones)
                # prepare for next round
                states = next_states
            #:while not done
            # Learn, if enough samples are available in memory
            if (i_episode % self.learn_every == 0):
                if len(self.memory) > self.batch_size:
                    for l in range(self.learn_num):
                        experiences = self.memory.sample()
                        self.learn(experiences)

            toc = time.time()
            # keep track of rewards:
            self.results.all_rewards.append(total_reward)
            self.results.avg_rewards.append(np.mean(
                self.results.reward_window))
            self.results.reward_window.append(np.max(total_reward))
            # Output Episode info :
            self.print_episode_info(total_reward, t, tic, toc)
        # for i_episode

        if self.save_model:
            filename = 'trained_tennis' + str(self.seed) + '.pth'
            self.save_agent(filename)

        return self.results

    def get_states(self, env_info):
        return np.reshape(env_info.vector_observations,
                          (1, 2 * self.state_size))

    def print_episode_info(self, total_reward, num_steps, tic, toc):
        if (self.episode % 20 == 0) or (np.max(total_reward) > 0.01):
            if np.max(total_reward) > 0.01:
                if np.sum(total_reward) > 0.15:
                    if np.sum(total_reward) > 0.25:
                        StyleString = Back.GREEN
                        print('Double Hit')
                    else:
                        StyleString = Back.BLUE
                else:
                    StyleString = Back.RED
            else:
                StyleString = ''
            print(
                StyleString +
                'Episode {} with {} steps || Reward : {} || avg reward : {:6.3f} || Noise {:6.3f} || {:5.3f} seconds, mem : {}'
                .format(self.episode, num_steps, total_reward,
                        np.mean(self.results.reward_window), self.noise_scale,
                        toc - tic, len(self.memory)))
            print(Style.RESET_ALL, end='')

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples.
        q_target = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value """

        states, actions, rewards, next_states, dones = experiences
        # print('Learning shape : ',states.shape, actions.shape, rewards.shape, next_states.shape, dones.shape)
        # print('Learning state & reward shape : ',states[0].shape,rewards[0].shape)

        actor_loss = []
        critic_loss = []
        both_next_actions = self.actor_target(next_states)

        # print('Learn both',both_next_actions.shape)
        for agent in self.maddpg_agent:
            # In case of joined_states, we want actions_next from both agents for learning
            al, cl = agent.learn(states, actions, rewards, next_states,
                                 both_next_actions, dones)
            actor_loss.append(al)
            critic_loss.append(cl)

        self.results.actor_loss.append(actor_loss)
        self.results.critic_loss.append(critic_loss)

    def save_agent(self, filename):
        states, actions, rewards, next_states, dones = self.memory.save_buffer(
        )
        print('save agent : ', states.shape, actions.shape, rewards.shape,
              next_states.shape, dones.shape)
        torch.save(
            {
                'critic_local0':
                self.maddpg_agent[0].critic_local.state_dict(),
                'critic_target0':
                self.maddpg_agent[0].critic_target.state_dict(),
                'actor_local0': self.maddpg_agent[0].actor_local.state_dict(),
                'actor_target0':
                self.maddpg_agent[0].actor_target.state_dict(),
                'critic_local1':
                self.maddpg_agent[1].critic_local.state_dict(),
                'critic_target1':
                self.maddpg_agent[1].critic_target.state_dict(),
                'actor_local1': self.maddpg_agent[1].actor_local.state_dict(),
                'actor_target1':
                self.maddpg_agent[1].actor_target.state_dict(),
                'memory': (states, actions, rewards, next_states, dones),
            }, filename)
        print('Saved Networks and ER-memory in ', filename)
        return

    def load_agent(self, filename):
        savedata = torch.load(filename)
        self.maddpg_agent[0].critic_local.load_state_dict(
            savedata['critic_local0'])
        self.maddpg_agent[0].critic_target.load_state_dict(
            savedata['critic_target0'])
        self.maddpg_agent[0].actor_local.load_state_dict(
            savedata['actor_local0'])
        self.maddpg_agent[0].actor_target.load_state_dict(
            savedata['actor_target0'])
        self.maddpg_agent[1].critic_local.load_state_dict(
            savedata['critic_local1'])
        self.maddpg_agent[1].critic_target.load_state_dict(
            savedata['critic_target1'])
        self.maddpg_agent[1].actor_local.load_state_dict(
            savedata['actor_local1'])
        self.maddpg_agent[1].actor_target.load_state_dict(
            savedata['actor_target1'])
        states, actions, rewards, next_states, dones = savedata['memory']
        self.memory.load_buffer(states, actions, rewards, next_states, dones)
        print('Memory loaded with length : ', len(self.memory))
        return