コード例 #1
0
class DDPG:
    """Deep Deterministic Policy Gradient agent.

    Owns a TensorFlow session, an actor network, a critic network, a replay
    buffer and an Ornstein-Uhlenbeck noise process used for exploration.
    """

    def __init__(self, state_dim, action_dim, env):
        """Build the session, both networks, the replay buffer and the noise.

        Args:
            state_dim: State size, forwarded to the actor and critic networks.
            action_dim: Action size, forwarded to both networks and to the
                noise process.
            env: Environment handle; only stored on ``self.environment``.
        """
        self.name = 'DDPG'  # name for uploading results
        self.time_step = 0
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.environment = env
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # Initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Ornstein-Uhlenbeck process used for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

    @staticmethod
    def _td_targets(reward_batch, done_batch, q_value_batch, gamma):
        """Return the critic targets as a column vector of shape (N, 1).

        Terminal rows get ``reward``; every other row gets
        ``reward + gamma * q``. Everything is flattened first so that 0-d
        rewards and shape-(1,) Q-values are never mixed in one Python list.
        """
        rewards = np.asarray(reward_batch).reshape(-1)
        terminal = np.asarray(done_batch, dtype=bool).reshape(-1)
        next_q = np.asarray(q_value_batch).reshape(-1)
        targets = np.where(terminal, rewards, rewards + gamma * next_q)
        return targets.reshape(-1, 1)

    def train(self):
        """Run one update of the critic, the actor and both target networks."""
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # Also covers action_dim = 1, where actions may be stored as scalars.
        # reshape (not np.resize) so a short batch is never padded with
        # repeated rows that would no longer line up with state_batch.
        action_batch = np.reshape(action_batch, (-1, self.action_dim))

        # Calculate y_batch
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = self._td_targets(reward_batch, done_batch, q_value_batch,
                                   GAMMA)

        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        """Return the actor's action for ``state`` plus exploration noise."""
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        """Return the actor's action for ``state`` without exploration noise."""
        return self.actor_network.action(state)

    def perceive(self, state, action, reward, next_state, done):
        """Store one transition and train once the buffer is warm.

        Returns:
            The number of training steps performed so far.
        """
        # Store transition (s_t, a_t, r_t, s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)
        if self.replay_buffer.count() == REPLAY_START_SIZE:
            print('\n---------------Start training---------------')
        # Train only once more than REPLAY_START_SIZE transitions are stored
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.time_step += 1
            self.train()

        # Checkpoint both networks every 10000 training steps
        if self.time_step % 10000 == 0 and self.time_step > 0:
            self.actor_network.save_network(self.time_step)
            self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

        return self.time_step
コード例 #2
0
ファイル: ddpg.py プロジェクト: nunuNNN/Renforcement-robot
class DDPG:
    """Deep Deterministic Policy Gradient agent with a background train thread.

    Owns a TensorFlow session, an actor network, a critic network, a replay
    buffer, an Ornstein-Uhlenbeck noise process and a ``threading.Thread``
    whose target is ``train``.
    """

    def __init__(self, state_dim, action_dim):
        """Build the session, networks, replay buffer, noise and thread.

        Args:
            state_dim: State size, forwarded to the actor and critic networks.
            action_dim: Action size, forwarded to both networks and to the
                noise process.
        """
        self.name = 'DDPG'  # name for uploading results
        self.time_step = 0
        self.state_dim = state_dim
        self.action_dim = action_dim

        self.sess = tf.InteractiveSession()

        # Actor and critic networks (their target networks are presumably
        # created inside these classes; not visible here)
        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)
        # Initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        # Ornstein-Uhlenbeck process used for action exploration
        self.exploration_noise = OUNoise(self.action_dim)
        # Placeholder thread; perceive() replaces it before starting one
        self.threading = threading.Thread(target=self.train,
                                          name='LoopThread--DDPG')

    @staticmethod
    def _td_targets(reward_batch, done_batch, q_value_batch, gamma):
        """Return the critic targets as a column vector of shape (N, 1).

        Terminal rows get ``reward``; every other row gets
        ``reward + gamma * q``. Everything is flattened first so that 0-d
        rewards and shape-(1,) Q-values are never mixed in one Python list.
        """
        rewards = np.asarray(reward_batch).reshape(-1)
        terminal = np.asarray(done_batch, dtype=bool).reshape(-1)
        next_q = np.asarray(q_value_batch).reshape(-1)
        targets = np.where(terminal, rewards, rewards + gamma * next_q)
        return targets.reshape(-1, 1)

    def train(self):
        """Increment ``time_step`` and run one actor/critic/target update."""
        self.time_step += 1
        # Sample a minibatch of BATCH_SIZE transitions from the replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])
        # Shape actions as [N, action_dim]. reshape (not np.resize) so a short
        # batch is never padded with repeated rows that would no longer line
        # up with state_batch.
        action_batch = np.reshape(action_batch, (-1, self.action_dim))
        # Calculate y_batch (critic targets)
        next_action_batch = self.actor_network.target_action(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = self._td_targets(reward_batch, done_batch, q_value_batch,
                                   GAMMA)
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)
        # Update the actor policy using the sampled gradient
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)
        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        """Return the actor's action for ``state`` plus exploration noise.

        The noise is added in place to the array returned by the actor.
        """
        action = self.actor_network.action(state)
        exp_noise = self.exploration_noise.noise()
        action += exp_noise
        return action

    def action(self, state):
        """Return the actor's action for ``state`` without exploration noise."""
        return self.actor_network.action(state)

    def perceive(self, state, action, reward, next_state, done):
        """Store one transition, train once the buffer is warm, and checkpoint."""
        # Store transition (s_t, a_t, r_t, s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)
        # Train only once more than REPLAY_START_SIZE transitions are stored
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()
            # NOTE(review): train() has just run synchronously above, and a
            # background thread running train() is started here as well. The
            # two can overlap with the next perceive() call and both bump
            # time_step, so the modulo checks below may skip or repeat a
            # step. Confirm which of the two training paths is intended.
            if not self.threading.is_alive():
                self.threading = threading.Thread(target=self.train,
                                                  name='LoopThread--DDPG')
                self.threading.start()
            # Progress report every 100 steps, checkpoint every 1000 steps
            if self.time_step % 100 == 0:
                print("Training_time_step:", self.time_step)
            if self.time_step % 1000 == 0:
                print("!!!!!!!save model success!!!!!!!!")
                self.actor_network.save_network(self.time_step)
                self.critic_network.save_network(self.time_step)
        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
コード例 #3
0
class DDPG:
    """Deep Deterministic Policy Gradient agent without exploration noise.

    Owns a TensorFlow session, an actor network, a critic network and a
    replay buffer.
    """

    def __init__(self, env, state_dim, action_dim):
        """Build the session, both networks and the replay buffer.

        Args:
            env: Environment handle; only stored on ``self.environment``.
            state_dim: State size, forwarded to the actor and critic networks.
            action_dim: Action size, forwarded to the actor and critic networks.
        """
        self.name = 'DDPG'
        self.environment = env
        self.time_step = 0
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # Initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    @staticmethod
    def _td_targets(reward_batch, done_batch, q_value_batch, gamma):
        """Return the critic targets as a column vector of shape (N, 1).

        Terminal rows get ``reward``; every other row gets
        ``reward + gamma * q``. Everything is flattened first so that 0-d
        rewards and shape-(1,) Q-values are never mixed in one Python list.
        """
        rewards = np.asarray(reward_batch).reshape(-1)
        terminal = np.asarray(done_batch, dtype=bool).reshape(-1)
        next_q = np.asarray(q_value_batch).reshape(-1)
        targets = np.where(terminal, rewards, rewards + gamma * next_q)
        return targets.reshape(-1, 1)

    def train(self):
        """Run one update of the critic, the actor and both target networks."""
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])
        # Also covers action_dim = 1, where actions may be stored as scalars.
        # reshape (not np.resize) so a short batch is never padded with
        # repeated rows that would no longer line up with state_batch.
        action_batch = np.reshape(action_batch, (-1, self.action_dim))

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = self._td_targets(reward_batch, done_batch, q_value_batch,
                                   GAMMA)
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def action(self, state):
        """Return the actor's action for ``state``."""
        return self.actor_network.action(state)

    def perceive(self, state, action, reward, next_state, done):
        """Store one transition and train once the buffer is warm.

        Returns:
            The number of training steps performed so far.
        """
        # Store transition (s_t, a_t, r_t, s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)
        if self.replay_buffer.count() == REPLAY_START_SIZE:
            print('\n---------------Start training---------------')
        # Train only once more than REPLAY_START_SIZE transitions are stored
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.time_step += 1
            self.train()

        # Checkpoint both networks every 10000 training steps
        if self.time_step % 10000 == 0 and self.time_step > 0:
            self.actor_network.save_network(self.time_step)
            self.critic_network.save_network(self.time_step)

        return self.time_step
コード例 #4
0
class DDPG:
    """Deep Deterministic Policy Gradient agent driven by subflow statistics.

    Besides the usual actor/critic/replay-buffer/noise members, the agent
    tracks the largest throughput and congestion window and the smallest
    RTT seen so far; ``extract_observation`` uses them to scale its features.
    """

    def __init__(self, a_dim, s_dim):
        """Build the session, networks, replay buffer, noise and running stats.

        Args:
            a_dim: Action size, forwarded to both networks and to the noise
                process.
            s_dim: State size, forwarded to the actor and critic networks.
        """
        self.name = 'DDPG'  # name for uploading results
        self.state_dim = s_dim
        self.action_dim = a_dim
        self.time_step = 0
        # Running extremes updated by extract_observation()
        self.max_bw = 0.0
        self.max_cwnd = 0.0
        self.min_rtt = 9999999.0

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

        # Initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Ornstein-Uhlenbeck process used for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

    @staticmethod
    def _td_targets(reward_batch, done_batch, q_value_batch, gamma):
        """Return the critic targets as a column vector of shape (N, 1).

        Terminal rows get ``reward``; every other row gets
        ``reward + gamma * q``. Everything is flattened first so that 0-d
        rewards and shape-(1,) Q-values are never mixed in one Python list.
        """
        rewards = np.asarray(reward_batch).reshape(-1)
        terminal = np.asarray(done_batch, dtype=bool).reshape(-1)
        next_q = np.asarray(q_value_batch).reshape(-1)
        targets = np.where(terminal, rewards, rewards + gamma * next_q)
        return targets.reshape(-1, 1)

    def learn(self):
        """Run one update of the critic, the actor and both target networks."""
        # Sample a minibatch of BATCH_SIZE transitions from the replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # Also covers action_dim = 1, where actions may be stored as scalars.
        # reshape (not np.resize) so a short batch is never padded with
        # repeated rows that would no longer line up with state_batch.
        action_batch = np.reshape(action_batch, (-1, self.action_dim))

        # Calculate y_batch
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch)
        y_batch = self._td_targets(reward_batch, done_batch, q_value_batch, GAMMA)

        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        """Increment ``time_step`` and return the actor's action plus noise."""
        self.time_step += 1
        action = self.actor_network.action(state)
        noise = self.exploration_noise.noise()
        return action + noise

    def choose_action(self, state):
        """Increment ``time_step`` and return the actor's action without noise."""
        self.time_step += 1
        return self.actor_network.action(state)

    def store_transition(self, s, a, r, s_, done, episode_count):
        """Store one transition, then either learn or checkpoint.

        Once the buffer holds more than REPLAY_START_SIZE transitions, a
        learning step runs unless ``episode_count + 1`` is a multiple of 100,
        in which case both networks are saved instead.
        """
        # Store transition (s_t, a_t, r_t, s_{t+1}) in replay buffer
        self.replay_buffer.add(s, a, r, s_, done)

        if self.replay_buffer.count() > REPLAY_START_SIZE:
            if (episode_count + 1) % 100 != 0:
                self.learn()
            else:
                # NOTE(review): this branch runs for every transition of such
                # an episode, so the networks are saved on each step and no
                # learning happens meanwhile - confirm this is intended.
                self.actor_network.save_network(self.time_step)
                self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

    def extract_observation(self, dataRecorder, subflow_index, state_before):
        """Build the next state and the reward from the latest recorded data.

        Args:
            dataRecorder: Object whose ``get_latest_data()`` returns a mapping
                with ``"nbOfSubflows"`` and, for each subflow ``i``, the keys
                ``"cWnd<i>"``, ``"rtt<i>"``, ``"unAck<i>"``,
                ``"loss_rate<i>"`` and ``"throughput<i>"``.
            subflow_index: Index of the subflow to observe.
            state_before: Array reshapeable to (10, 5).

        Returns:
            A tuple ``(state_after, reward, throughput, rtt)``. ``state_after``
            is the flat previous state with its first row of five values
            dropped and the five new features appended.
        """
        value_dic = dataRecorder.get_latest_data()
        history = state_before.reshape(10, 5)

        # Keep the original two zero-filled slots as a minimum, but grow with
        # the reported subflow count so three or more subflows do not raise
        # IndexError.
        n_subflows = value_dic["nbOfSubflows"]
        slots = max(2, n_subflows)
        t_cWnd = [0] * slots
        t_thr = [0] * slots
        t_rtt = [0] * slots
        t_loss_rate = [0] * slots
        t_unAck = [0] * slots
        for i in range(n_subflows):
            t_cWnd[i] = value_dic["cWnd" + str(i)]
            t_rtt[i] = value_dic["rtt" + str(i)]
            t_unAck[i] = value_dic["unAck" + str(i)]
            t_loss_rate[i] = value_dic["loss_rate" + str(i)]
            t_thr[i] = value_dic["throughput" + str(i)]

        cwnd = t_cWnd[subflow_index]
        # Going through one array gives all five metrics a common NumPy dtype
        s0 = np.array([
            t_thr[subflow_index],
            t_rtt[subflow_index],
            cwnd,
            t_loss_rate[subflow_index],
            t_unAck[subflow_index],
        ])
        thr_n_min, rtt_min, cwnd_n_min, loss_rate_n_min, unAck_n_min = s0

        # Update the running extremes
        if self.max_bw < thr_n_min:
            self.max_bw = thr_n_min
        if self.max_cwnd < cwnd_n_min:
            self.max_cwnd = cwnd_n_min
        if self.min_rtt > rtt_min:
            self.min_rtt = rtt_min

        # Reward: throughput, penalised by RTT above the minimum and by loss
        reward = thr_n_min - 5 * (rtt_min - self.min_rtt) - 10 * loss_rate_n_min
        print("reward:"+str(reward)+" thr_n_min:"+str(thr_n_min)+ " rtt_min:"+str(rtt_min)+" self.min_rtt :"+str(self.min_rtt)+"  delta_rtt"+str(rtt_min-self.min_rtt))

        if self.max_bw != 0:
            features = [thr_n_min / self.max_bw, 5 * loss_rate_n_min, unAck_n_min]
        else:
            features = [0, 0, 0]
        # NOTE(review): neither division below is guarded. A zero cwnd
        # (e.g. a subflow with no data) or a zero rtt divides by zero.
        features.append(1400 / cwnd)
        features.append(self.min_rtt / rtt_min)
        state = np.array(features, dtype=float)

        # Drop the oldest row; np.append without an axis flattens the result
        history = np.delete(history, [0], axis=0)
        state_after = np.append(history, state)

        return state_after, reward, thr_n_min, rtt_min