    owned_squares, current_states = window.get_windows(game_map.contents, myID)

    moves = proto.get_action(current_states)

    moves = moves.numpy().tolist()
    moves = [Move(square, move) for square, move in zip(owned_squares, moves)]

    hlt.send_frame(moves)
    game_map.get_frame()

    new_states = window.get_windows_for_squares(game_map.contents,
                                                owned_squares)

    #rewards = [reward.reward(s) for s in new_states]

    rewards = [
        reward.reward2(current_states[i], new_states[i])
        for i in range(len(owned_squares))
    ]

    #logging.debug(rewards)

    tuples = zip(current_states, moves, rewards, new_states)

    r.add_tuples(tuples)

    if len(r) >= BATCH_SIZE:
        proto.train2(r.get_batch(BATCH_SIZE))

    EPSILON *= 0.99
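
The loop above relies on a replay object r that exposes add_tuples, len(), and get_batch, none of which is shown here. The following is only a minimal sketch of a buffer with that interface; the class name, capacity, and uniform sampling are assumptions, not taken from the original source.

import random
from collections import deque

class SimpleReplayBuffer:
    """Minimal buffer matching the calls above: add_tuples(), len(), get_batch()."""

    def __init__(self, capacity=10000):
        self.storage = deque(maxlen=capacity)  # oldest experiences are dropped first

    def add_tuples(self, tuples):
        # tuples: an iterable of (state, move, reward, next_state) as built above
        self.storage.extend(tuples)

    def __len__(self):
        return len(self.storage)

    def get_batch(self, batch_size):
        # uniform random sample, the usual choice for experience replay
        return random.sample(list(self.storage), batch_size)
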
Example #2
class DDPG(nn.Module):
    def __init__(
        self,
        state_dim,
        action_dim,
        learning_rate_a=1e-3,
        learning_rate_c=1e-3,
        gamma=0.99,
        update_tau=1e-3,
        batch_size=100,
        buffer_size=10000,
        training_start=1000,
    ):
        super(DDPG, self).__init__()
        self.s_dim = state_dim
        self.a_dim = action_dim
        self.lr_a = learning_rate_a
        self.lr_c = learning_rate_c
        self.gamma = gamma
        self.update_tau = update_tau
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.training_start = training_start
        self.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')

        self.actor = Actor(input_dim=self.s_dim,
                           output_dim=self.a_dim,
                           update_tau=self.update_tau).to(self.device)
        self.critic = Critic(state_dim=self.s_dim,
                             action_dim=self.a_dim,
                             update_tau=self.update_tau).to(self.device)
        self.buffer = ReplayBuffer(buffer_size=self.buffer_size)

        self.loss_actor = 0
        self.loss_critic = 0
        self.optimizer_a = optim.Adam(self.actor.eval_net.parameters(),
                                      lr=self.lr_a)
        self.optimizer_c = optim.Adam(self.critic.parameters(), lr=self.lr_c)

    def choose_action(self, s):
        s = torch.Tensor(s).to(self.device)
        return self.actor.get_eval(s).to(
            torch.device('cpu')).detach().numpy().tolist()

    def perceive(self, state, action, reward, state_, done):
        self.buffer.add(state, action, reward, state_, done)
        if self.training_start < self.buffer.count():
            self.Train()

    def get_critic_loss(self, reward, state_next, state, action, done):
        action_next = self.actor.get_target(state_next)
        q_next_tar = self.critic.get_target(s=state_next, a=action_next)
        Q_target = reward + self.gamma * q_next_tar * (1 - done)
        Q_eval = self.critic.get_eval(s=state, a=action)
        return F.mse_loss(Q_target, Q_eval)

    def Train(self):
        minibatch = self.buffer.get_batch(batch_size=self.batch_size)
        state_batch = torch.Tensor([data[0]
                                    for data in minibatch]).to(self.device)
        action_batch = torch.Tensor([data[1]
                                     for data in minibatch]).to(self.device)
        reward_batch = torch.Tensor([data[2]
                                     for data in minibatch]).to(self.device)
        state_next_batch = torch.Tensor([data[3] for data in minibatch
                                         ]).to(self.device)
        done_batch = torch.Tensor([data[4]
                                   for data in minibatch]).to(self.device)

        #train critic
        self.loss_critic = self.get_critic_loss(reward_batch, state_next_batch,
                                                state_batch, action_batch,
                                                done_batch)
        self.optimizer_c.zero_grad()
        self.loss_critic.backward()
        self.optimizer_c.step()

        #train actor
        self.loss_actor = -self.critic.get_eval(state_batch,
                                                action_batch).mean()
        self.optimizer_a.zero_grad()
        self.loss_actor.backward()
        self.optimizer_a.step()

        #update the target net
        self.actor.soft_update()
        self.critic.soft_update()
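
A minimal loop that drives the PyTorch DDPG agent above. This is a sketch only: the Gym environment (Pendulum-v0), the classic reset/step API, and the Gaussian exploration noise are assumptions for illustration and are not part of the original listing.

import gym
import numpy as np

env = gym.make('Pendulum-v0')  # assumed environment
agent = DDPG(state_dim=env.observation_space.shape[0],
             action_dim=env.action_space.shape[0])

for episode in range(200):
    state = env.reset()  # classic Gym API assumed (returns the observation only)
    for step in range(200):
        action = np.array(agent.choose_action(state))
        action += np.random.normal(0.0, 0.1, size=action.shape)  # exploration noise (assumption)
        next_state, reward, done, _ = env.step(action)
        agent.perceive(state, action, reward, next_state, done)  # stores the transition, trains once the buffer is warm
        state = next_state
        if done:
            break
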
class DDPG:
    """docstring for DDPG"""
    def __init__(self, state_space, action_dim):
        self.name = 'DDPG'  # name for uploading results
        self.sess = tf.Session()

        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_space = state_space
        self.action_dim = action_dim  # 1

        self.ac_network = ActorCriticNetwork(self.sess, self.state_space,
                                             self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

    def train(self):
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Get Q target label
        # maxQ(s',a')
        q_value_batch = self.ac_network.target_q(next_state_batch)

        # Calculate target maxQ(s,a): y = reward + GAMMA * maxQ(s',a')
        y_batch = []
        batch_size = len(minibatch)
        for i in range(batch_size):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [batch_size, 1])

        # Update eval critic network by minimizing the loss L
        cost = self.ac_network.train_critic(y_batch, state_batch, action_batch)
        print('step_%d critic cost:' % self.ac_network.time_step, cost)

        # Update eval actor policy using the sampled gradient:
        self.ac_network.train_actor(state_batch)

        # Update the target networks
        self.ac_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        action = self.ac_network.actions(state)
        return action[0] + self.exploration_noise.noise()

    def action(self, state):
        action = self.ac_network.actions([state])
        return action[0]

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Store transitions until the buffer reaches the replay start size, then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        #if self.time_step % 10000 == 0:
        #self.actor_network.save_network(self.time_step)
        #self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

    def sparse_tensor(self, state_batch, state_space):
        row = len(state_batch)
        indices = []
        for r in range(row):
            indices += [(r, c) for c in state_batch[r]]
        values = [1.0 for i in range(len(indices))]
        return tf.SparseTensorValue(indices=indices,
                                    values=values,
                                    dense_shape=[row, state_space])
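
The per-sample loop that builds y_batch in train() above can also be written in vectorized NumPy. A sketch, assuming reward_batch, done_batch, and q_value_batch are the arrays computed in that method:

import numpy as np

# y = r + GAMMA * maxQ(s', a') for non-terminal transitions, y = r at episode ends
q_next = np.asarray(q_value_batch).reshape(-1)
y_batch = reward_batch + GAMMA * q_next * (1.0 - done_batch.astype(np.float32))
y_batch = y_batch.reshape(-1, 1)
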
Example #4
class RDPG:
    """docstring for RDPG"""
    def __init__(self, env):
        self.name = 'RDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        self.saver = tf.train.Saver()

    def train(self):
        # Sample a random minibatch of N sequences from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        # Construct histories
        observations = []
        next_observations = []
        actions = []
        rewards = []
        dones = []
        for each in minibatch:
            for i in range(1, len(each.observations)):
                observations.append(self.pad(each.observations[0:i]))
                next_observations.append(self.pad(each.observations[1:i + 1]))
                actions.append(each.actions[0:i - 1])
                rewards.append(each.rewards[0:i])
                if i == len(each.observations) - 1:
                    dones.append(True)
                else:
                    dones.append(False)
        # Calculate y_batch
        next_action_batch = self.actor_network.target_action(observations)
        q_value_batch = self.critic_network.target_q(
            next_observations,
            [self.pad(i + j) for (i, j) in zip(actions, next_action_batch)])
        y_batch = []
        for i in range(len(observations)):
            if dones[i]:
                y_batch.append(rewards[i][-1])
            else:
                y_batch.append(rewards[i][-1] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [len(observations), 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, observations,
                                  [self.pad(i) for i in actions])

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(observations)
        q_gradient_batch = self.critic_network.gradients(
            observations, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, observations)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def save_model(self, path, episode):
        self.saver.save(self.sess, path + "model.ckpt", episode)

    def noise_action(self, history):
        # Select action a_t according to a sequence of observation and action
        action = self.actor_network.action(history)
        return action + self.exploration_noise.noise()

    def action(self, history):
        action = self.actor_network.action(history)
        return action

    def perceive(self, history):
        # Store the history sequence in the replay buffer
        self.replay_buffer.add(history)

        # Store histories until the buffer reaches the replay start size, then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

    def pad(self, input):
        dim = len(input[0])
        return input + [[0] * dim] * (1000 - len(input))
def main(args):
    if VERBOSE:
        print '***The Replay Buffer currently always returns the most recent experiences (instead of random), so the batches are constant between the tf and torch nets.'

    state_dim = 3
    action_dim = 1

    net = ActorCriticNet(state_dim, action_dim)

    target_net = copy.deepcopy(net)
    memory = ReplayBuffer(REPLAY_BUFFER_SIZE)
    noise = OUNoise(action_dim)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE, weight_decay=L2)
    target_optim = optim.Optimizer(target_net.parameters(),
                                   {})  # to iterate over target params

    if VERBOSE: print '***Making gym env (only used to setup TF net).'

    # load tf net (restoring saved parameters)
    dtf = ddpg_tf.DDPG_TF(filter_env.makeFilteredEnv(gym.make('Pendulum-v0')),
                          loadfilename='tf_params-0',
                          printVars=False)

    if VERBOSE: print '***TF net restore complete.'

    # load control data (only using a every fourth data), and tf net results
    control_states = np.load('control_states.npy')[::4]
    control_rewards = np.load('control_rewards.npy')[::4]
    tf_record = np.load('tf_control_record.npy')

    # replace torch params with tf params, and run control data, collecting torch net results
    # first optimization step will occur at i == 50, upon which extra data is recorded to compare tf and torch
    # using: no bn, REPLAY_BUFFER_SIZE=200, REPLAY_START_SIZE=50, BATCH_SIZE=50, constant replay_buffer_batches (always the most recent experiences)
    replaceNetParams(dtf, net, target_net)

    if VERBOSE: print '***Torch net params initialized to TF net params.'

    original_net = copy.deepcopy(net)  # save original net
    original_target_net = copy.deepcopy(target_net)

    torch_record = []

    loss = -1
    first_step = True

    for i in xrange(len(control_rewards) - 1):
        state = torch.from_numpy(control_states[i].reshape(1,
                                                           state_dim)).float()
        action = net.getAction(Variable(state)).data
        target_action = target_net.getAction(Variable(state)).data

        reward = torch.FloatTensor([[control_rewards[i]]]).float()

        new_state = torch.from_numpy(control_states[i + 1].reshape(
            1, state_dim)).float()

        memory.add(state, action, reward, new_state, True)
        if memory.count() > REPLAY_START_SIZE:
            minibatch = memory.get_batch(BATCH_SIZE)
            state_batch = torch.cat([data[0] for data in minibatch], dim=0)
            action_batch = torch.cat([data[1] for data in minibatch], dim=0)
            reward_batch = torch.cat([data[2] for data in minibatch])
            next_state_batch = torch.cat([data[3] for data in minibatch],
                                         dim=0)
            done_batch = Tensor([data[4] for data in minibatch])

            # calculate y_batch from targets
            #next_action_batch = target_net.getAction(Variable(next_state_batch))
            value_batch = target_net.getValue(Variable(next_state_batch)).data
            y_batch = reward_batch + GAMMA * value_batch * done_batch

            if first_step:
                if VERBOSE: print '***First Optimization Step complete.'
                torch_ys = y_batch
                torch_batch = minibatch
                torch_outs = net.getValue(Variable(state_batch)).data

            # optimize net 1 step
            loss = criterion(net.getValue(Variable(state_batch)),
                             Variable(y_batch))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss = loss.data[0]

            # update targets - using exponential moving averages
            for group, target_group in zip(optimizer.param_groups,
                                           target_optim.param_groups):
                for param, target_param in zip(group['params'],
                                               target_group['params']):
                    target_param.data.mul_(1 - TAU)
                    target_param.data.add_(TAU, param.data)

            if first_step:
                first_step_net = copy.deepcopy(net)
                first_step_target_net = copy.deepcopy(target_net)
                first_step = False

        torch_record.append(
            [action.numpy()[0][0],
             target_action.numpy()[0][0], loss])
        loss = -1

    torch_record = np.array(torch_record)
    torch_outs = torch_outs.numpy().T[0]
    torch_ys = torch_ys.numpy().T[0]

    if VERBOSE: print '***Control Data run complete.'

    # compare torch and tf results
    # results for each net have 3 columns: [net action prediction, target net action prediction, loss (-1 if there was no training)]
    sel = np.arange(45, 55)
    #print calc_error(tf_record[sel,:], torch_record[sel,:])
    print 'Result comparison:'
    print 'control_data_index | tf_net_action | tf_target_net_action | tf_loss | torch_net_action | torch_target_net_action | torch_loss'
    print np.hstack(
        [sel[:, np.newaxis], tf_record[sel, :], torch_record[sel, :]])
    print '\t(a loss of -1 means no training occured in that step)'

    # load all tf results from before taking first optimization step
    tf_ys = np.load('tf_first_step_y_batch.npy')
    tf_rs = np.load('tf_first_step_reward_batch.npy')
    tf_ds = np.load('tf_first_step_done_batch.npy')
    tf_vs = np.load('tf_first_step_value_batch.npy')
    tf_outs = np.load('tf_first_step_output_values.npy')
    torch_wd = 1.36607  # weight decay loss of tf net at first optimization step - recorded directly from terminal output of tf net

    if VERBOSE:
        print '***Comparing first step stats'

        # compare tf and torch data from before taking first optimization step
        # including calculation of manual loss
        print '\terror in ys (between tf and torch)', calc_error(
            torch_ys, tf_ys)
        print '\terror in predictions (between tf and torch)', calc_error(
            torch_outs, tf_outs)
        print '\ttorch loss (manually calculated)', np.mean(
            (torch_ys - torch_outs)**2)
        print '\ttf loss (manually calculated)', np.mean((tf_ys - tf_outs)**2)
        print '\ttorch loss', torch_record[50,
                                           2], '(not including weight decay)'
        print '\ttf loss', tf_record[
            50, 2] - torch_wd, '(not including weight decay)'

    return 0
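
The nested param_groups loop in main() above performs an exponential-moving-average (soft) target update. The same update can be written directly over module parameters; a sketch, assuming net and target_net are the two ActorCriticNet instances from main() and TAU is the same constant:

import torch

def soft_update(target_net, net, tau=TAU):
    # target <- (1 - tau) * target + tau * source, parameter by parameter
    with torch.no_grad():
        for t_param, s_param in zip(target_net.parameters(), net.parameters()):
            t_param.mul_(1.0 - tau).add_(tau * s_param)
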
Example #6
class DDPG:
    """docstring for DDPG"""
    def __init__(self, env):
        mx.random.seed(seed)
        np.random.seed(seed)
        self.env = env
        if flg_gpu:
            self.ctx = mx.gpu(0)
        else:
            self.ctx = mx.cpu()
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.ddpgnet = DDPGNet(self.state_dim, self.action_dim)
        self.exploration_noise = OUNoise(self.action_dim)
        self.replay_buffer = ReplayBuffer(memory_size)

        self.batch_size = batch_size

        self.ddpgnet.init()
        self.train_step = 0

    def train(self):
        # print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(self.batch_size)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch,
                                 [self.batch_size, self.action_dim])

        # Calculate y_batch
        next_qvals = self.ddpgnet.get_target_q(next_state_batch).asnumpy()

        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * next_qvals[i][0])
        y_batch = np.resize(y_batch, [self.batch_size, 1])

        # Update critic by minimizing the loss L
        self.ddpgnet.update_critic(state_batch, action_batch, y_batch)

        # Update actor by maximizing Q
        self.ddpgnet.update_actor(state_batch)

        self.train_step += 1
        # update target networks
        self.ddpgnet.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        state = np.reshape(state, (1, self.state_dim))
        action = self.ddpgnet.get_step_action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        state = np.reshape(state, (1, self.state_dim))
        action = self.ddpgnet.get_step_action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Store transitions until the buffer reaches the replay start size, then start training
        if self.replay_buffer.count() > memory_start_size:
            self.train()

            # if self.time_step % 10000 == 0:
            # self.actor_network.save_network(self.time_step)
            # self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
Example #7
class DDPG:
    """docstring for DDPG"""
    def __init__(self, state_dim, action_dim):
        """name for uploading resuults"""
        self.name = 'DDPG'
        self.time_step = 0
        # self.atten_rate = 1
        """Randomly initialize actor network and critic network"""
        """and both their target networks"""
        self.state_dim = state_dim
        self.action_dim = action_dim

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)
        """initialize replay buffer"""
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        """Initialize a random process the Ornstein-Uhlenbeck process for action exploration"""
        self.exploration_noise = OUNoise(self.action_dim)
        """Initialize a Treading"""
        self.threading = threading.Thread(target=self.train,
                                          name='LoopThread--DDPG')

    def train(self):
        # if self.time_step ==0:
        #     print("Begins Training!!!")
        #print("Training Begins")
        self.time_step += 1
        """Sample a random minibatch of N transitions from replay buffer"""
        """take out BATCH_SIZE sets of data"""
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])
        """resize the action_batch shape to  [BATCH_SIZE, self.action_dim]"""
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])
        """Calculate y_batch(reward)"""
        next_action_batch = self.actor_network.target_action(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        """Update critic by minimizing the loss L (training)"""
        self.critic_network.train(y_batch, state_batch, action_batch)
        """Update the actor policy using the sampled gradient:"""
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)
        """Update the target networks"""
        self.actor_network.update_target()
        self.critic_network.update_target()
        #print("Training Finished")

    def noise_action(self, state):
        """Select action a_t according to the current policy and exploration noise"""
        action = self.actor_network.action(state)
        exp_noise = self.exploration_noise.noise()
        action += exp_noise
        # action[0] = np.clip(action[0], 0, 1)
        # action[1] = np.clip(action[1], -1, 1)
        return action

    def action(self, state):
        action = self.actor_network.action(state)
        # action[0] = np.clip(action[0], 0, 1)
        # action[1] = np.clip(action[1], -1, 1)
        return action

    def perceive(self, state, action, reward, next_state, done):
        """Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer"""
        self.replay_buffer.add(state, action, reward, next_state, done)
        """Store transitions to replay start size then start training"""
        # if self.replay_buffer.count() % 1000 == 0:
        #     print("The buffer count is ", self.replay_buffer.count())
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()
            # self.atten_rate *= 0.99995
            if not self.threading.is_alive():
                self.threading = threading.Thread(target=self.train,
                                                  name='LoopThread--DDPG')
                self.threading.start()
            """SAVE NETWORK"""
            if self.time_step % 100 == 0:
                print("Training_time_step:", self.time_step)
            if self.time_step % 1000 == 0:
                print("!!!!!!!save model success!!!!!!!!")
                self.actor_network.save_network(self.time_step)
                self.critic_network.save_network(self.time_step)
        """Re-iniitialize the random process when an episode ends"""
        if done:
            self.exploration_noise.reset()
Example #8
class DDPG:
    """docstring for DDPG"""
    def __init__(self, sess, data_fname):
        self.name = 'DDPG'  # name for uploading results
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = Hp.state_dim
        self.action_dim = Hp.action_dim
        print(self.state_dim, self.action_dim)

        self.sess = sess

        self.state_input = [
            tf.placeholder(tf.float32, shape=(None, None, Hp.n_coord))
            for _ in xrange(Hp.categories)
        ]
        #tf.placeholder("float",[None,self.state_dim])
        self.target_state_input = [
            tf.placeholder(tf.float32, shape=(None, None, Hp.n_coord))
            for _ in xrange(Hp.categories)
        ]
        #tf.placeholder("float",[None,self.state_dim])
        self.state_network = StateEnc(self.sess, self.state_input,
                                      self.target_state_input)
        state_batch = self.state_network.encoding
        next_state_batch = self.state_network.target_encoding

        weights, biases, w_i2h0, w_h2h0, w_b0, w_i2h1, w_h2h1, w_b1, w_i2h2, w_h2h2, w_b2 = self.state_network.get_parameters(
        )

        state_network_params = weights + biases + [
            w_i2h0, w_h2h0, w_b0, w_i2h1, w_h2h1, w_b1, w_i2h2, w_h2h2, w_b2
        ]

        self.actor_network = ActorNetwork(self.sess, Hp.n_hidden,
                                          self.action_dim, self.state_input,
                                          state_batch, next_state_batch,
                                          state_network_params)
        self.critic_network = CriticNetwork(self.sess, Hp.n_hidden,
                                            self.action_dim, state_batch,
                                            next_state_batch)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(Hp.REPLAY_BUFFER_SIZE, data_fname)
        self.summary_str2 = None

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

    def train(self):
        #print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatches = self.replay_buffer.get_batch(Hp.batch_size * Hp.N_TRAIN)
        print("######### TRAINING   #############")
        for k in range(Hp.N_TRAIN):
            minibatch = minibatches[k * Hp.batch_size:(k + 1) * Hp.batch_size]
            state_batch_r = np.asarray([data[0] for data in minibatch])
            state_batch = []
            for j in range(Hp.categories):
                new_cat = np.stack(state_batch_r[:, j], axis=0)
                state_batch.append(new_cat)
            #state_batch = [np.expand_dims(state_batch, axis=1)]
            action_batch = np.asarray([data[1] for data in minibatch])
            reward_batch = np.asarray([data[2] for data in minibatch])
            next_state_batch_r = np.asarray([data[3] for data in minibatch])
            next_state_batch = []
            for j in range(Hp.categories):
                new_cat = np.stack(next_state_batch_r[:, j], axis=0)
                next_state_batch.append(new_cat)
            #next_state_batch = [np.expand_dims(next_state_batch, axis=1)]
            done_batch = np.asarray([data[4] for data in minibatch])

            # for action_dim = 1
            action_batch = np.resize(action_batch,
                                     [Hp.batch_size, self.action_dim])

            next_action_batch = self.actor_network.target_actions(
                self.target_state_input, next_state_batch)
            q_value_batch = self.critic_network.target_q(
                self.target_state_input, next_state_batch, next_action_batch)
            y_batch = []

            for i in range(len(minibatch)):
                if done_batch[i]:
                    y_batch.append(reward_batch[i])
                else:
                    y_batch.append(reward_batch[i] +
                                   Hp.GAMMA * q_value_batch[i])

            y_batch = np.resize(y_batch, [Hp.batch_size, 1])

            # Update critic by minimizing the loss L
            self.critic_network.train(y_batch, self.state_input, state_batch,
                                      action_batch)

            # Update the actor policy using the sampled gradient:
            action_batch_for_gradients = self.actor_network.actions(
                self.state_input, state_batch)
            q_gradient_batch = self.critic_network.gradients(
                self.state_input, state_batch, action_batch_for_gradients)

            self.summary_str2 = self.actor_network.train(
                q_gradient_batch, self.state_input, state_batch)

            # Update the target networks
            self.actor_network.update_target()
            self.critic_network.update_target()
            self.state_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        state = [np.expand_dims(el, axis=0) for el in state]
        action = self.actor_network.action(state)
        print("no noise ", action)
        return np.clip(
            action +
            self.exploration_noise.noise() * np.array([-17.0, 17.0, 900.0]),
            [-35.0, 0.0, 0.0], [0.0, 35.0, 2000.0])

    def action(self, state):
        state = [np.expand_dims(el, axis=0) for el in state]
        action = self.actor_network.action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Store transitions until the buffer reaches the replay start size, then start training
        if self.replay_buffer.count() > Hp.REPLAY_START_SIZE:
            self.train()

        #if self.time_step % 10000 == 0:
        #self.actor_network.save_network(self.time_step)
        #self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
Example #9
    ]

    hlt.send_frame(moves)
    game_map.get_frame()

    new_targets = window.get_targets(game_map, owned_squares, directions)

    done = [int(t.owner == id) for t in new_targets]

    new_states = window.prepare_for_input(game_map, new_targets, myID)

    rewards = reward.reward(owned_squares, old_targets, new_targets, myID)

    #logging.debug(rewards)

    for i in range(len(owned_squares)):

        r.add(old_states[i], directions[i], rewards[i], new_states[i], done[i])

    if len(r) >= BATCH_SIZE:
        batch = r.get_batch(BATCH_SIZE)

        loss, rewar = model.train(batch)

        writer.save_progress(tm.content["timesteps"], loss, rewar)

    #if(timestep % 10 == 0):
    #logging.debug(model.trainable_variables[0])

    tm.content["timesteps"] += 1
Example #10
class DDPG:
    def __init__(self, env, state_dim, action_dim):
        self.name = 'DDPG'
        self.environment = env
        self.time_step = 0
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.linear_noise = OUNoise(1, 0.5, 0.3, 0.6)
        self.angular_noise = OUNoise(1, 0, 0.6, 0.8)

    def train(self):
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])
        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state, epsilon):
        action = self.actor_network.action(state)
        noise_t = np.zeros(self.action_dim)
        noise_t[0] = epsilon * self.linear_noise.noise()
        noise_t[1] = epsilon * self.angular_noise.noise()
        action = action + noise_t
        a_linear = np.clip(action[0], 0, 1)
        a_linear = round(a_linear, 1)
        a_angular = np.clip(action[1], -1, 1)
        a_angular = round(a_angular, 1)
        #print(a_linear, a_angular)

        return [a_linear, a_angular]

    def action(self, state):
        action = self.actor_network.action(state)
        a_linear = np.clip(action[0], 0, 1)
        a_linear = round(a_linear, 1)
        a_angular = np.clip(action[1], -1, 1)
        a_angular = round(a_angular, 1)

        return [a_linear, a_angular]

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)
        if self.replay_buffer.count() == REPLAY_START_SIZE:
            print('\n---------------Start training---------------')
        # Store transitions until the buffer reaches the replay start size, then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.time_step += 1
            self.train()

        if self.time_step % 10000 == 0 and self.time_step > 0:
            self.actor_network.save_network(self.time_step)
            self.critic_network.save_network(self.time_step)

        if done:
            self.linear_noise.reset()
            self.angular_noise.reset()

        return self.time_step
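
A minimal episode loop for the agent above. The environment, its state/action dimensions, and the epsilon decay schedule are assumptions for illustration rather than part of the original example; the agent interface (noise_action, perceive) is taken from the class as written.

def run_training(env, state_dim, action_dim, episodes=1000, max_steps=500):
    agent = DDPG(env, state_dim, action_dim)
    epsilon = 1.0
    for episode in range(episodes):
        state = env.reset()
        for step in range(max_steps):
            action = agent.noise_action(state, epsilon)  # [linear, angular], already clipped
            next_state, reward, done, _ = env.step(action)
            agent.perceive(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
        epsilon = max(0.05, epsilon * 0.995)  # assumed decay schedule
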
Example #11
class DDQN:
    def __init__(self, model_name, action_dim):
        self.device = configure.DEVICE
        self.model_name = model_name
        self.action_dim = action_dim
        self.episode = 0
        # self.timeStep = 0
        self.STARTtrain = False
        self.epsilon = INITIAL_EPSILON

        self.img_width = configure.IMAGE_WIDTH
        self.img_height = configure.IMAGE_HEIGHT
        self.img_channels = configure.STACKED_FRAMES * 4

        self.learning_rate = configure.LEARNING_RATE_START
        self.tau = configure.TargetNet_Tau

        self.replaybuffer = ReplayBuffer(REPLAY_MEMORY)

        self.graph = tf.Graph()
        with self.graph.as_default() as g:
            with tf.device(self.device):
                with tf.variable_scope('Main_net'):
                    self.imageIn, self.conv1, self.conv2, self.conv3, self.pool1, self.conv4, \
                    self.Advantage, self.Value, self.Qout, self.predict \
                        = self.__create_graph()

                with tf.variable_scope('Target_net'):
                    self.imageInT, _, _, _, _, _, _, _, self.QoutT, _ = self.__create_graph(
                    )

                self.MainNet_vars = get_variables('Main_net')
                self.TargetNet_vars = get_variables('Target_net')
                self.createTrainingMethod()
                self.createupdateTargetNetOp()

                self.sess = tf.Session(
                    graph=self.graph,
                    config=tf.ConfigProto(
                        allow_soft_placement=True,
                        log_device_placement=False,
                        gpu_options=tf.GPUOptions(allow_growth=True)))
                self.sess.run(tf.global_variables_initializer())

                if configure.TENSORBOARD:
                    self._create_tensor_board()
                # if configure.LOAD_CHECKPOINT or configure.SAVE_MODELS:
                #     vars = tf.global_variables()
                #     self.saver = tf.train.Saver({var.name: var for var in vars}, max_to_keep=0)

                self.saver = tf.train.Saver()

                checkpoint = tf.train.get_checkpoint_state(self.model_name)
                if checkpoint and checkpoint.model_checkpoint_path:
                    self.saver.restore(self.sess,
                                       checkpoint.model_checkpoint_path)
                    print "Successfully loaded:", checkpoint.model_checkpoint_path
                    mypath = str(checkpoint.model_checkpoint_path)
                    stepmatch = re.split('-', mypath)[2]
                    self.episode = int(stepmatch)
                # pass
                else:
                    print "Could not find old network weights"

    # def __create_main_graph(self):
    #     self.imageIn = tf.placeholder(tf.float32, [None, self.img_height, self.img_width, self.img_channels], name='imgIn')
    #
    #     self.conv1 = self.conv2d_layer(self.imageIn, 8, 32, 'conv1', strides=[1, 4, 4, 1])
    #     self.conv2 = self.conv2d_layer(self.conv1, 4, 64, 'conv2', strides=[1, 2, 2, 1])
    #     self.conv3 = self.conv2d_layer(self.conv2, 3, 128, 'conv3', strides=[1, 1, 1, 1])
    #     self.conv4 = self.conv2d_layer(self.conv3, self.conv3.get_shape()[1].value, 512, 'conv4', strides=[1,1,1,1])
    #     with tf.variable_scope('A_V'):
    #         self.streamAC, self.streamVC = tf.split(self.conv4, 2, 3)
    #         self.streamA = tf.contrib.layers.flatten(self.streamAC)
    #         self.streamV = tf.contrib.layers.flatten(self.streamVC)
    #
    #         self.AW = tf.Variable(tf.random_normal([self.streamA, self.action_dim]), name='AW')
    #         self.VW = tf.Variable(tf.random_normal([self.streamV, 1]), name='VW')
    #         self.Advantage = tf.matmul(self.streamA, self.AW, name='Advantage')
    #         self.Value = tf.matmul(self.streamV, self.VW, name='Value')
    #
    #     with tf.variable_scope('Qout'):
    #         self.Qout = self.Value + tf.subtract(
    #             self.Advantage, tf.reduce_mean(self.Advantage, reduction_indices=1, keep_dims=True))
    #
    #     with tf.variable_scope('Predict'):
    #         self.predict = tf.argmax(self.Qout, 1)

    def __create_graph(self):
        imageIn = tf.placeholder(
            tf.float32,
            [None, self.img_height, self.img_width, self.img_channels],
            name='imgIn')

        conv1 = self.conv2d_layer(imageIn,
                                  8,
                                  128,
                                  'conv1',
                                  strides=[1, 4, 4, 1])
        conv2 = self.conv2d_layer(conv1, 4, 128, 'conv2', strides=[1, 2, 2, 1])
        conv3 = self.conv2d_layer(conv2, 3, 128, 'conv3', strides=[1, 1, 1, 1])
        pool1 = self.mpool_layer(conv3, 2, [1, 2, 2, 1], name='pool1')
        conv4 = self.conv2d_layer(pool1,
                                  pool1.get_shape()[1].value,
                                  1024,
                                  'conv4',
                                  strides=[1, 1, 1, 1],
                                  padding='VALID')

        streamAC, streamVC = tf.split(conv4, 2, 3)
        streamA = tf.contrib.layers.flatten(streamAC)
        streamV = tf.contrib.layers.flatten(streamVC)

        Advantage = self.fc_layer(streamA,
                                  self.action_dim,
                                  'Advantage',
                                  func=None)
        Value = self.fc_layer(streamV, 1, 'Value', func=None)

        # AW = tf.Variable(tf.random_normal([streamA.get_shape()[1].value, self.action_dim]), name='AW')
        # VW = tf.Variable(tf.random_normal([streamV.get_shape()[1].value, 1]), name='VW')
        # Advantage = tf.matmul(streamA, AW, name='Advantage')
        # Value = tf.matmul(streamV, VW, name='Value')
        with tf.variable_scope('Qout'):
            Qout = Value + tf.subtract(
                Advantage,
                tf.reduce_mean(Advantage, reduction_indices=1, keep_dims=True))
        with tf.variable_scope('Predict'):
            predict = tf.argmax(Qout, 1)

        return imageIn, conv1, conv2, conv3, pool1, conv4, Advantage, Value, Qout, predict

    # def __create_target_graph(self):
    #     self.target_imageIn = tf.placeholder(tf.float32, [None, self.img_height, self.img_width, self.img_channels],
    #                                   name='imgIn')
    #     self.target_conv1 = self.conv2d_layer(self.target_imageIn, 8, 32, 'conv1', strides=[1, 4, 4, 1])
    #     self.target_conv2 = self.conv2d_layer(self.target_conv1, 4, 64, 'conv2', strides=[1, 2, 2, 1])
    #     self.target_conv3 = self.conv2d_layer(self.target_conv2, 3, 128, 'conv3', strides=[1, 1, 1, 1])
    #     self.target_conv4 = self.conv2d_layer(self.target_conv3, self.target_conv3.get_shape()[1].value, 512, 'conv4', strides=[1, 1, 1, 1])
    #     with tf.variable_scope('A_V'):
    #         self.target_streamAC, self.target_streamVC = tf.split(self.target_conv4, 2, 3)
    #         self.target_streamA = tf.contrib.layers.flatten(self.target_streamAC)
    #         self.target_streamV = tf.contrib.layers.flatten(self.target_streamVC)
    #
    #         self.target_AW = tf.Variable(tf.random_normal([self.target_streamA, self.action_dim]), name='AW')
    #         self.target_VW = tf.Variable(tf.random_normal([self.target_streamV, 1]), name='VW')
    #         self.target_Advantage = tf.matmul(self.target_streamA, self.target_AW, name='Advantage')
    #         self.target_Value = tf.matmul(self.target_streamV, self.target_VW, name='Value')
    #
    #     with tf.variable_scope('Qout'):
    #         self.Qout = self.target_Value + tf.subtract(
    #             self.target_Advantage, tf.reduce_mean(self.target_Advantage, reduction_indices=1, keep_dims=True))

    def createTrainingMethod(self):
        self.global_step = tf.Variable(0, trainable=False, name='step')
        self.var_learning_rate = tf.placeholder(tf.float32,
                                                name='lr',
                                                shape=[])
        self.targetQ = tf.placeholder(shape=[None],
                                      dtype=tf.float32,
                                      name='targetQ')
        self.actions = tf.placeholder(shape=[None],
                                      dtype=tf.int32,
                                      name='actions')
        self.actions_onehot = tf.one_hot(self.actions,
                                         self.action_dim,
                                         dtype=tf.float32,
                                         name='act_onehot')
        self.Q = tf.reduce_sum(tf.multiply(self.Qout, self.actions_onehot),
                               reduction_indices=1,
                               name='Q')
        self.td_error = tf.square(self.targetQ - self.Q, name='td_error')
        self.loss = tf.reduce_mean(self.td_error, name='loss')
        self.trainer = tf.train.AdamOptimizer(
            learning_rate=self.var_learning_rate)
        self.train_op = self.trainer.minimize(self.loss,
                                              global_step=self.global_step,
                                              name='train_update')

    def createupdateTargetNetOp(self):
        self.assign_op = {}
        for from_, to_ in zip(self.MainNet_vars, self.TargetNet_vars):
            self.assign_op[to_.name] = to_.assign(self.tau * from_ +
                                                  (1 - self.tau) * to_)

    def updateTargetNet(self):
        for var in self.TargetNet_vars:
            self.sess.run(self.assign_op[var.name])

    def conv2d_layer(self,
                     input,
                     filter_size,
                     out_dim,
                     name,
                     strides,
                     func=tf.nn.relu,
                     padding='SAME'):
        in_dim = input.get_shape()[-1].value
        d = 1.0 / np.sqrt(filter_size * filter_size * in_dim)
        with tf.variable_scope(name):
            w_init = tf.random_uniform_initializer(-d, d)
            b_init = tf.random_uniform_initializer(-d, d)
            w = tf.get_variable(
                'w',
                shape=[filter_size, filter_size, in_dim, out_dim],
                dtype=tf.float32,
                initializer=w_init)
            b = tf.get_variable('b', shape=[out_dim], initializer=b_init)

            output = tf.nn.conv2d(input, w, strides=strides,
                                  padding=padding) + b
            if func is not None:
                output = func(output)

        return output

    def mpool_layer(self, input_op, mpool_size, strides, name):
        with tf.variable_scope(name):
            output = tf.nn.max_pool(input_op,
                                    ksize=[1, mpool_size, mpool_size, 1],
                                    strides=strides,
                                    padding="SAME")
        return output

    def fc_layer(self, input, out_dim, name, func=tf.nn.relu):
        in_dim = input.get_shape()[-1].value
        d = 1.0 / np.sqrt(in_dim)
        with tf.variable_scope(name):
            w_init = tf.random_uniform_initializer(-d, d)
            b_init = tf.random_uniform_initializer(-d, d)
            w = tf.get_variable('w',
                                dtype=tf.float32,
                                shape=[in_dim, out_dim],
                                initializer=w_init)
            b = tf.get_variable('b',
                                dtype=tf.float32,
                                shape=[out_dim],
                                initializer=b_init)

            output = tf.matmul(input, w) + b
            if func is not None:
                output = func(output)

        return output

    def _create_tensor_board(self):
        summaries = tf.get_collection(tf.GraphKeys.SUMMARIES)
        summaries.append(tf.summary.scalar("Loss", self.loss))
        for var in tf.trainable_variables():
            summaries.append(tf.summary.histogram("W_%s" % var.name, var))

        summaries.append(tf.summary.histogram("conv1", self.conv1))
        summaries.append(tf.summary.histogram("conv2", self.conv2))
        summaries.append(tf.summary.histogram("conv3", self.conv3))
        summaries.append(tf.summary.histogram("pool1", self.pool1))
        summaries.append(tf.summary.histogram("conv4", self.conv4))
        summaries.append(tf.summary.histogram("Advantage", self.Advantage))
        summaries.append(tf.summary.histogram("Value", self.Value))
        summaries.append(tf.summary.histogram("Qout", self.Qout))
        summaries.append(tf.summary.histogram("Q", self.Q))

        self.summary_op = tf.summary.merge(summaries)
        self.log_writer = tf.summary.FileWriter("logs/%s" % self.model_name,
                                                self.sess.graph)

    def log(self, y_batch, action_batch, state_batch):
        feed_dict = {
            self.targetQ: y_batch,
            self.actions: action_batch,
            self.imageIn: state_batch,
            self.var_learning_rate: self.learning_rate
        }
        step, summary = self.sess.run([self.global_step, self.summary_op],
                                      feed_dict=feed_dict)
        self.log_writer.add_summary(summary, step)

    def trainQNetwork(self):
        minibatch = self.replaybuffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        action_batch = np.resize(action_batch, [BATCH_SIZE])

        A = self.sess.run(self.predict,
                          feed_dict={self.imageIn: next_state_batch})
        Q = self.sess.run(self.QoutT,
                          feed_dict={self.imageInT: next_state_batch})
        doubleQ = Q[range(BATCH_SIZE), A]
        targetQ = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                targetQ.append(reward_batch[i])
            else:
                targetQ.append(reward_batch[i] + GAMMA * doubleQ[i])
        # targetQ = np.resize(targetQ, [BATCH_SIZE, 1])
        self.sess.run(self.train_op,
                      feed_dict={
                          self.imageIn: state_batch,
                          self.targetQ: targetQ,
                          self.actions: action_batch,
                          self.var_learning_rate: self.learning_rate
                      })

        self.updateTargetNet()

        if self.episode % configure.SAVE_NET == 0 and self.episode != 0:
            self.saver.save(self.sess,
                            self.model_name + '/network' + '-dqn',
                            global_step=self.episode)

        if configure.TENSORBOARD and self.episode % configure.TENSORBOARD_UPDATE_FREQUENCY == 0 and self.episode != 0:
            self.log(targetQ, action_batch, state_batch)

        self.episode += 1
        self.STARTtrain = True

    def setPerception(self, nextObservation, action, reward, terminal):
        newState = np.concatenate(
            (self.currentState[:, :, 4:], nextObservation), axis=2)
        self.replaybuffer.add(self.currentState, action, reward, newState,
                              terminal)
        # self.replayMemory.append((self.currentState, action, reward, newState, terminal))
        if self.episode <= OBSERVE:
            state = "observe"
        elif self.episode > OBSERVE and self.episode <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"

        if self.episode % 100 == 0 and self.STARTtrain:
            print "episode", self.episode , "/ STATE", state, \
                "/ EPSILON", self.epsilon

        self.currentState = newState

    def Perce_Train(self):
        if self.replaybuffer.count() > configure.REPLAY_START_SIZE:
            self.trainQNetwork()

    def getAction(self):
        if np.random.rand(1) < self.epsilon:
            action_get = np.random.randint(0, self.action_dim)
        else:
            action_get = self.sess.run(
                self.predict, feed_dict={self.imageIn: [self.currentState]})

        if self.epsilon > FINAL_EPSILON and self.episode > OBSERVE:
            self.epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        return action_get

    def setInitState_rgb(self, observation):
        self.currentState = observation
        for i in xrange(configure.STACKED_FRAMES - 1):
            self.currentState = np.concatenate(
                (self.currentState, observation), axis=2)
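
The targetQ loop in trainQNetwork() above computes the standard double-DQN target. A vectorized NumPy equivalent, assuming reward_batch, done_batch, and doubleQ are the arrays already computed in that method:

import numpy as np

# target = r + GAMMA * Q_target(s', argmax_a Q_main(s', a)); terminal states bootstrap to r only
targetQ = reward_batch + GAMMA * doubleQ * (1.0 - done_batch.astype(np.float32))
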
class DDPG(object):
    def __init__(self, a_dim, s_dim, a_bound, m_dim, pixel_meter, att_dim):
        self.time_step = 1
        self.memory = ReplayBuffer(MEMORY_CAPACITY)
        self.exploration_noise = OUNoise(a_dim)
        self.pointer = 0
        self.sess = tf.Session()
        writer = tf.summary.FileWriter("logs/", self.sess.graph)

        self.a_dim, self.s_dim, self.a_bound, self.m_dim, self.pixel_meter, self.att_dim = \
            a_dim, s_dim, a_bound, m_dim, pixel_meter, att_dim
        self.S = tf.placeholder(tf.float32, [None, s_dim], 's')
        self.S_ = tf.placeholder(tf.float32, [None, s_dim], 's_')
        self.R = tf.placeholder(tf.float32, [None, 1], 'r')
        self.GM = tf.placeholder(tf.float32, [None, m_dim, m_dim, 1], 'gm')
        self.LM = tf.placeholder(tf.int32, [None, att_dim*2+1, att_dim*2+1, 4], 'lm')
        self.LM_ = tf.placeholder(tf.int32, [None, att_dim*2+1, att_dim*2+1, 4], 'lm_')

        self.a = self._build_a(self.S, self.GM, self.LM, )
        q = self._build_c(self.S, self.GM, self.LM, self.a, )
        a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Actor')
        c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Critic')
        ema = tf.train.ExponentialMovingAverage(decay=1 - TAU)  # soft replacement

        def ema_getter(getter, name, *args, **kwargs):
            return ema.average(getter(name, *args, **kwargs))

        target_update = [ema.apply(a_params), ema.apply(c_params)]  # soft update operation
        a_ = self._build_a(self.S_, self.GM, self.LM_, reuse=True, custom_getter=ema_getter)  # replaced target parameters
        q_ = self._build_c(self.S_, self.GM, self.LM_, a_, reuse=True, custom_getter=ema_getter)

        a_loss = - tf.reduce_mean(q)  # maximize the q
        self.atrain = tf.train.AdamOptimizer(LR_A).minimize(a_loss, var_list=a_params)

        with tf.control_dependencies(target_update):  # soft replacement happens here
            q_target = self.R + GAMMA * q_
            td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q)
            self.ctrain = tf.train.AdamOptimizer(LR_C).minimize(td_error, var_list=c_params)

        self.sess.run(tf.global_variables_initializer())

    def noise_action(self, s1, gm1, loc1):
        locm = np.zeros([1, self.att_dim*2+1, self.att_dim*2+1, 4])
        for j in range(self.att_dim * 2 + 1):
            for k in range(self.att_dim * 2 + 1):
                locm[0, j, k, :] = np.array([0, loc1[0] - self.att_dim + j, loc1[1] - self.att_dim + k, 0])
        return self.sess.run(self.a, {self.S: s1[np.newaxis, :], self.GM: gm1[np.newaxis, :, :, np.newaxis],
                                      self.LM: locm})[0] + self.exploration_noise.noise()

    def action(self, s1, gm1, loc1):
        locm = np.zeros([1, self.att_dim * 2 + 1, self.att_dim * 2 + 1, 4])
        for j in range(self.att_dim * 2 + 1):
            for k in range(self.att_dim * 2 + 1):
                locm[0, j, k, :] = np.array([0, loc1[0] - self.att_dim + j, loc1[1] - self.att_dim + k, 0])
        return self.sess.run(self.a, {self.S: s1[np.newaxis, :], self.GM: gm1[np.newaxis, :, :, np.newaxis],
                                      self.LM: locm})[0]

    def perceive(self, sd, p, loc, s, a_store, r, s_, loc_, done):
        self.memory.add(sd, p, loc, s, a_store, r, s_, loc_, done)
        if self.memory.count() > REPLAY_START:
            self.learn()
        if self.time_step % 500000 == 0:
            self.save_network()

    def learn(self):
        self.time_step += 1
        replay = self.memory.get_batch(BATCH_SIZE)
        bm_sd = np.asarray([data[0] for data in replay])
        bp = np.asarray([data[1] for data in replay])
        bloc = np.asarray([data[2] for data in replay])
        bs = np.asarray([data[3] for data in replay])
        ba = np.asarray([data[4] for data in replay])
        br = np.reshape(np.asarray([data[5] for data in replay]), [-1, 1])
        bs_ = np.asarray([data[6] for data in replay])
        bloc_ = np.asarray([data[7] for data in replay])
        bgm = np.zeros([BATCH_SIZE, self.m_dim, self.m_dim, 1])
        for batch in range(BATCH_SIZE):
            sd1 = bm_sd[batch]
            terrian_map = grid_map(sd1, self.m_dim, self.pixel_meter, bp[batch])
            bgm[batch, :, :, 0] = terrian_map.map_matrix
        blocm = np.zeros([BATCH_SIZE, self.att_dim*2+1, self.att_dim*2+1, 4])
        blocm_ = np.zeros([BATCH_SIZE, self.att_dim * 2 + 1, self.att_dim * 2 + 1, 4])
        for i in range(BATCH_SIZE):
            for j in range(self.att_dim*2+1):
                for k in range(self.att_dim*2+1):
                    blocm[i, j, k, :] = np.array([i, bloc[i, 0]-self.att_dim+j, bloc[i, 1]-self.att_dim+k, 0])
                    blocm_[i, j, k, :] = np.array([i, bloc_[i, 0] - self.att_dim + j, bloc_[i, 1] - self.att_dim + k, 0])

        self.sess.run(self.atrain, {self.S: bs, self.GM: bgm, self.LM: blocm})
        self.sess.run(self.ctrain, {self.GM: bgm, self.S: bs, self.LM: blocm, self.a: ba, self.R: br, self.S_: bs_, self.LM_: blocm_})

    def _build_a(self, s, gm, locm, reuse=None, custom_getter=None):

        def _conv2d_keep_size(x, y, kernel_size, name, use_bias=False, reuse_conv=None, trainable_conv=True):
            return tf.layers.conv2d(inputs=x,
                                    filters=y,
                                    kernel_size=kernel_size,
                                    padding="same",
                                    use_bias=use_bias,
                                    kernel_initializer=tf.truncated_normal_initializer(stddev=0.01),
                                    bias_initializer=tf.truncated_normal_initializer(stddev=0.01),
                                    reuse=reuse_conv,
                                    name=name,
                                    trainable=trainable_conv)

        def _build_vin(mat, name, reuse, trainable_vin):
            h1 = _conv2d_keep_size(mat, 150, 3, name+"_h1", use_bias=True, reuse_conv=reuse, trainable_conv=trainable_vin)
            r = _conv2d_keep_size(h1, 1, 1, name+"_r", reuse_conv=reuse, trainable_conv=trainable_vin)
            q0 = _conv2d_keep_size(r, 10, 9, name+"_q0", reuse_conv=reuse, trainable_conv=trainable_vin)
            v = tf.reduce_max(q0, axis=3, keep_dims=True, name=name+"_v")
            rv = tf.concat([r, v], axis=3)
            q = _conv2d_keep_size(rv, 10, 9, name + "_q", reuse_conv=False, trainable_conv=trainable_vin)
            v = tf.reduce_max(q, axis=3, keep_dims=True, name=name + "_v")
            for k in range(30):
                rv = tf.concat([r, v], axis=3)
                q = _conv2d_keep_size(rv, 10, 9, name+"_q", reuse_conv=True, trainable_conv=trainable_vin)
                v = tf.reduce_max(q, axis=3, keep_dims=True, name=name+"_v")
            return v

        trainable = True if reuse is None else False
        with tf.variable_scope('Actor', reuse=reuse, custom_getter=custom_getter):
            gv = _build_vin(gm, name="global_map_vin", reuse=reuse, trainable_vin=trainable)
            att = tf.reshape(tf.gather_nd(gv, locm), [-1, (self.att_dim*2+1)**2])
            layer_1 = tf.layers.dense(s, 300, activation=tf.nn.relu, name='l1', trainable=trainable)
            layer_2a = tf.layers.dense(layer_1, 600, name='l2a', trainable=trainable)
            layer_2att = tf.layers.dense(att, 600, name='l2att', trainable=trainable)
            layer_2 = tf.add(layer_2a, layer_2att, name="l2")
            layer_3 = tf.layers.dense(layer_2, 600, activation=tf.nn.relu, name='l3', trainable=trainable)
            a = tf.layers.dense(layer_3, 7, activation=tf.nn.tanh, name='a1', trainable=trainable)
            return a

    def _build_c(self, s, gm, loc, a, reuse=None, custom_getter=None):
        trainable = True if reuse is None else False
        with tf.variable_scope('Critic', reuse=reuse, custom_getter=custom_getter):
            gm_flat = tf.reshape(gm, [-1, self.m_dim**2])
            layer_gm = tf.layers.dense(gm_flat, self.s_dim, activation=tf.nn.relu, name='lgm', trainable=trainable)
            s_all = tf.concat([layer_gm, s], axis=1)
            layer_1 = tf.layers.dense(s_all, 300, activation=tf.nn.relu, name='l1', trainable=trainable)
            layer_2s = tf.layers.dense(layer_1, 600, activation=None, name='l2s', trainable=trainable)
            layer_2a = tf.layers.dense(a, 600, activation=None, name='l2a', trainable=trainable)
            layer_2 = tf.add(layer_2s, layer_2a, name="l2")
            layer_3 = tf.layers.dense(layer_2, 600, activation=tf.nn.relu, name='l3', trainable=trainable)
            return tf.layers.dense(layer_3, 1, trainable=trainable)  # Q(s,a)

    def save_network(self):
        self.saver = tf.train.Saver()
        print("save ddpg-network...", self.time_step)
        self.saver.save(self.sess, 'saved_ddpg_networks/' + "ddpg-network", global_step=self.time_step)

    def load_network(self):
        self.saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state("saved_ddpg_networks")
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print("Successfully loaded:", checkpoint.model_checkpoint_path)
        else:
            print("Could not find old network weights")
Example #13
class Algorithm:
     def __init__(self):
         self.replay_buffer = ReplayBuffer(buffer_size=BUFFER_MAX, past_frame_len=FRAME_SKIP, multi_step=N_STEP)

     # Initial
     def Initial(self):
     # Initialize your session or restore neural-net parameters
         # self.buffer_size = 0
         self.ctx = try_gpu(GPU_INDEX)
         self.frame_cnt = 0
         self.train_count = 0
         self.loss_sum = 0

         self.q_count = 0
         self.q_sum = 0
         self.dtype = DTYPE
         INPUT_SAMPLE = nd.random_uniform(0,1,(1, FRAME_SKIP, 11), self.ctx, self.dtype)
         self.target_net = self.get_net(INPUT_SAMPLE)
         self.policy_net = self.get_net(INPUT_SAMPLE)

         if MODEL_FILE is not None:
             print('%s: read trained results from [%s]' % (tm.strftime("%Y-%m-%d %H:%M:%S"), MODEL_FILE))
             self.policy_net.load_params(MODEL_FILE, ctx=self.ctx)
         self.update_target_net()
         # RMSProp optimizer
         self.trainer = Trainer(self.policy_net.collect_params(),
                                optimizer=mx.optimizer.RMSProp(LEARNING_RATE, 0.95, 0.95))
         self.loss_func = loss.L2Loss()

         self.epsilon = EPSILON_START
         self.epsilon_min = EPSILON_MIN
         self.epsilon_rate = (EPSILON_START - EPSILON_MIN) / EPSILON_DECAY
         self.rng = np.random.RandomState(int(time() * 1000) % 100000000)


     def update_target_net(self):
         self.copy_params(self.policy_net, self.target_net)
         return

     def calculate_reward(self,end_of_video,cdn_flag,rebuf,end_delay,skip_frame_time_len,decision_flag,bitrate,last_bitrate,frame_time_len):
         if end_of_video <= 1.0:
             LANTENCY_PENALTY = 0.005
         else:
             LANTENCY_PENALTY = 0.01
         if not cdn_flag:
             reward_frame = frame_time_len * float(BIT_RATE[
                                                   bitrate]) / 1000 - REBUF_PENALTY * rebuf - LANTENCY_PENALTY * end_delay - SKIP_PENALTY * skip_frame_time_len
         else:
             reward_frame = -(REBUF_PENALTY * rebuf)
         if decision_flag or end_of_video:
             reward_frame += -1 * SMOOTH_PENALTY * (abs(BIT_RATE[bitrate] - BIT_RATE[last_bitrate]) / 1000)
         return reward_frame

     def run_frame(self,time, time_interval, send_data_size, chunk_len, \
                rebuf, buffer_size, play_time_len, end_delay, \
                cdn_newest_id, download_id, cdn_has_frame, skip_frame_time_len, decision_flag, \
                buffer_flag, cdn_flag, skip_flag, end_of_video,action,last_action,frame_time_len):

         bitrate, target_buffer, latency = self.action_to_submit(action)
         last_bitrate,_,_ = self.action_to_submit(last_action)
         reward_frame = self.calculate_reward(end_of_video,cdn_flag,rebuf,end_delay,skip_frame_time_len,decision_flag,bitrate,last_bitrate,frame_time_len)
         self.replay_buffer.insert_sample(time_interval, send_data_size, chunk_len, rebuf, buffer_size, play_time_len,end_delay, cdn_newest_id, download_id, cdn_has_frame, skip_frame_time_len,decision_flag, buffer_flag, cdn_flag, skip_flag, end_of_video, reward_frame, action)
         st = self.replay_buffer.get_current_state()
         st = nd.array(st, ctx=self.ctx, dtype=self.dtype).reshape((1, FRAME_SKIP, -1))
         action, max_q = self.choose_action(False, False, st)
         bit_rate, target_buffer, latency_limit = self.action_to_submit(action)
         self.frame_cnt += 1
         if max_q is not None:
             self.q_count += 1
             self.q_sum += max_q
         if  self.frame_cnt % TRAIN_PER_STEP == 0:
             state, s_, actions, rewards = self.replay_buffer.get_batch(16)
             loss = self.train_policy_net(state, actions, rewards, s_)
             self.train_count += 1
             self.loss_sum += loss
         # FIXME: do these counters need to be reset to zero when the video ends?
         if end_of_video:
             average_loss = self.loss_sum / (self.train_count + 0.0001)
             average_q = self.q_sum / (self.q_count + 0.000001)
             self.loss_sum = 0
             self.train_count = 0
             self.q_count = 0
             self.q_sum = 0
         else:
             average_loss = 0
             average_q = 0
         return reward_frame,bit_rate, target_buffer, latency_limit,action,average_loss,average_q

     def train_policy_net(self,states,actions,rewards,next_states):
         batch_size = actions.shape[0]
         s = states.shape
         states = nd.array(states,ctx=self.ctx,dtype=self.dtype)
         actions = nd.array(actions[:,0],ctx=self.ctx)
         rewards = nd.array(rewards[:,0],ctx=self.ctx)
         next_states = nd.array(next_states,ctx=self.ctx,dtype=self.dtype)

         next_qs = self.target_net(next_states)
         next_q_out = nd.max(next_qs,axis=1)

         target = rewards + next_q_out * 0.99 ** MULTI_STEP

         with autograd.record():
             current_qs = self.policy_net(states)
             current_q = nd.pick(current_qs,actions,1)
             loss = self.loss_func(target,current_q)
         loss.backward()
         self.trainer.step(16)
         total_loss = loss.mean().asscalar()
         return total_loss

     def save_params_to_file(self,model_path,mark):
         time_mark = tm.time()
         filename = model_path + '/net_' + str(mark) + '_' + str(time_mark) + '.model'
         self.policy_net.save_params(filename)
         print(tm.strftime(TIME_FORMAT), 'save results success:',filename)
         files = getNewestFile(model_path)
         if len(files) > 5:
             tmp = files[5:]
             for f in tmp:
                 if os.path.exists(model_path + "/" + f):
                     os.remove(model_path + "/" + f)
                     print(f + "is deleted.")

     def get_net(self, input_sample):
         if IS_DUELING:
             net = dueling_dqn.DuelingDQN()
             net.initialize(init.Xavier(), ctx=self.ctx)
         else:
             net = dueling_dqn.OriginDQN()
             net.initialize(init.Xavier(), ctx=self.ctx)
         net(input_sample)
         return net

     def choose_action(self, random_action, testing, st):
         self.epsilon = max(self.epsilon_min, self.epsilon - self.epsilon_rate)
         max_q = None
         random_num = self.rng.rand()
         if random_action or ((not testing) and random_num < self.epsilon):
             action = self.rng.randint(0,ACTION_NUM)
         else:
             out = self.policy_net(st)
             max_index = nd.argmax(out, axis=1)
             action = int(max_index.astype(np.int).asscalar())
             max_q = out[0, action].asscalar()
         return action, max_q

     def action_to_submit(self,action):
         bit_rate = action % 4
         target_buffer = action // 4
         latency_limit = 4
         return bit_rate, target_buffer, latency_limit


     # Define your algorithm
     def run(self, time, S_time_interval, S_send_data_size, S_chunk_len, S_rebuf, S_buffer_size, S_play_time_len,
             S_end_delay, S_decision_flag, S_buffer_flag,S_cdn_flag,S_skip_time,
             end_of_video, cdn_newest_id,download_id,cdn_has_frame,IntialVars):
         # state = np.empty(shape=(len(S_time_interval),11),dtype=np.float32)
         S_end_of_video = [0] * FRAME_SKIP
         S_end_of_video[-1] = end_of_video
         state = [S_time_interval[-FRAME_SKIP:],S_send_data_size[-FRAME_SKIP:],S_chunk_len[-FRAME_SKIP:], S_buffer_size[-FRAME_SKIP:], S_rebuf[-FRAME_SKIP:],
                      S_end_delay[-FRAME_SKIP:],  S_play_time_len[-FRAME_SKIP:],S_decision_flag[-FRAME_SKIP:], S_cdn_flag[-FRAME_SKIP:],S_skip_time[-FRAME_SKIP:],S_end_of_video]

         state = nd.array(state,dtype=self.dtype).transpose((1,0)).reshape((1,FRAME_SKIP,-1))
         # print(state.shape)

         action, max_q = self.choose_action(False,True,state)
         # print(action)
         bit_rate, target_buffer, latency_limit = self.action_to_submit(action)
         print(bit_rate, target_buffer, latency_limit)

         return bit_rate, target_buffer, latency_limit

     def run(self, time, S_time_interval, S_send_data_size, S_chunk_len, S_rebuf, S_buffer_size, S_play_time_len,
             S_end_delay, S_decision_flag, S_buffer_flag, S_cdn_flag, S_skip_time, end_of_video, cdn_newest_id,
             download_id, cdn_has_frame, IntialVars):

         # If you choose the machine learning approach
         '''state = []
         state[0] = ...
         state[1] = ...
         state[2] = ...
         state[3] = ...
         state[4] = ...
         decision = actor.predict(state).argmax()
         bit_rate, target_buffer = decision // 4, decision % 4 .....
         return bit_rate, target_buffer'''

         # If you choose BBA
         RESEVOIR = 0.5
         CUSHION = 1.5

         if S_buffer_size[-1] < RESEVOIR:
             bit_rate = 0
         elif S_buffer_size[-1] >= RESEVOIR + CUSHION and S_buffer_size[-1] < CUSHION + CUSHION:
             bit_rate = 2
         elif S_buffer_size[-1] >= CUSHION + CUSHION:
             bit_rate = 3
         else:
             bit_rate = 1

         target_buffer = 0
         latency_limit = 4

         return bit_rate, target_buffer, latency_limit




     def get_params(self):
     # get your params
        your_params = []
        return your_params

     def copy_params(self, src_net, dst_net):
         ps_src = src_net.collect_params()
         ps_dst = dst_net.collect_params()
         prefix_length = len(src_net.prefix)
         for k, v in ps_src.items():
             k = k[prefix_length:]
             v_dst = ps_dst.get(k)
             v_dst.set_data(v.data())
            else:
                # Choose best action
                a = np.argmax(q_values)
            # Perform choosen action
            next_state, reward, done, _ = env.step(a)
            episode_reward += reward
            episode_steps += 1
            # Insert into replay buffer
            repbuf.add_sample((state, a, reward, next_state, done))
            state = next_state
            # Stats
            total_max_q += q_values.max()
            # Check if we need to train
            if step % STEPS_TO_TRAIN == 0:
                # Get a batch from replaybuffer
                batch = repbuf.get_batch(BATCH_SIZE)
                state_batch, action_batch, reward_batch, next_state_batch, done_batch = zip(*batch)
                pred_nextQ = sess.run(target_dqn.logits, feed_dict={target_dqn.input: next_state_batch})
                max_nextQ = np.max(pred_nextQ, axis=1)
                pred_values = np.array(reward_batch) + np.invert(done_batch).astype('float32') * GAMMA * max_nextQ
                cost = dqn.train(state_batch, action_batch, pred_values, sess)


    elif FLAGS.mode == 'test':
        # Testing mode
        epsilon = 0.05
        rewards = []
        for _ in trange(100):
            done = False
            obs = env.reset()
            reward = 0
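# Note: the training snippet above forms the one-step Q-learning target with the
# "done" flags folded in as a mask.  A small self-contained restatement (array
# names below are placeholders, not the variables used above):
import numpy as np

def td_targets_sketch(rewards, dones, next_q_max, gamma=0.99):
    """y_i = r_i if done_i, else r_i + gamma * max_a Q_target(s'_i, a)."""
    rewards = np.asarray(rewards, dtype=np.float32)
    mask = 1.0 - np.asarray(dones, dtype=np.float32)  # 0 where the episode ended
    return rewards + mask * gamma * np.asarray(next_q_max, dtype=np.float32)

# e.g. td_targets_sketch([1.0, 0.5], [False, True], [2.0, 3.0]) -> [2.98, 0.5]
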
Example #15
class DDPG:
    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        # self.state_dim = env.observation_space.shape[0] * 2
        self.action_dim = env.action_space.shape[0]

        self.time_step = 0
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        # self.exploration_noise = OUNoise(self.action_dim)
        self.exploration_noise = OUNoise()
        # loading networks
        self.saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state(MODEL_PATH)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            my_config.logger.warn("Successfully loaded: %s" %
                                  (checkpoint.model_checkpoint_path))
        else:
            my_config.logger.error("Could not find old network weights")

    def train(self):
        # my_config.logger.debug("......enter tain......")
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        noise = self.exploration_noise.noise(action)
        # if random.random() <= 0.5:
        #     noise = self.exploration_noise.noise(action,
        #         mu=[0, 0, 0, 1, 0, 0, 0.25, 0.75, 0.75, 0, 0, 0, 0, 0.5, 0.5, 0, 0, 0.5])
        # else:
        #     noise = self.exploration_noise.noise(action,
        #         mu=[0, 0, 0, 0, 0.5, 0.5, 0, 0, 0.5, 0, 0, 0, 1, 0, 0, 0.25, 0.75, 0.75])
        noise_action = action + noise
        clipped_noise_action = np.clip(noise_action, 0, 1)
        # if (self.time_step < 5):
        #     my_config.logger.debug("action: %s, noise: %s, clip: %s" % (action, noise, clipped_noise_action))
        return clipped_noise_action

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        self.time_step = self.time_step + 1

        # Store transitions to replay start size then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        #if self.time_step % 10000 == 0:
        #self.actor_network.save_network(self.time_step)
        #self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        # if done:
        #     self.exploration_noise.reset()

    def saveNetwork(self):
        # my_config.logger.warn("time step: %s, save model" % (self.time_step))
        ckpt_file = os.path.join(MODEL_PATH, 'ltr')
        self.saver.save(self.sess, ckpt_file, global_step=self.time_step)
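# Note: the DDPG agents in this file add exploration noise drawn from an
# Ornstein-Uhlenbeck process (OUNoise).  A minimal sketch of such a process,
# assuming the common theta/sigma parameterisation (the OUNoise class actually
# imported above may differ in its details):
import numpy as np

class OUNoiseSketch:
    """Temporally correlated noise: dx = theta * (mu - x) + sigma * N(0, 1)."""
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(action_dim)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        self.state = self.mu.copy()

    def noise(self):
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state
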
class DDPG:
    def __init__(self, env, state_dim, action_dim):
        self.name = 'DDPG'
        self.environment = env
        self.time_step = 0
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    def train(self):
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])
        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def action(self, state):
        action = self.actor_network.action(state)

        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)
        if self.replay_buffer.count() == REPLAY_START_SIZE:
            print('\n---------------Start training---------------')
        # Store transitions to replay start size then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.time_step += 1
            self.train()

        if self.time_step % 10000 == 0 and self.time_step > 0:
            self.actor_network.save_network(self.time_step)
            self.critic_network.save_network(self.time_step)

        return self.time_step
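# Note: every example in this file assumes a ReplayBuffer exposing
# add / get_batch / count (and sometimes erase), but its implementation is not
# shown.  A minimal deque-based sketch with that interface, as an assumption
# about the buffer used above rather than its real code:
import random
from collections import deque

class ReplayBufferSketch:
    def __init__(self, buffer_size):
        self.buffer = deque(maxlen=buffer_size)  # oldest transitions drop out first

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def count(self):
        return len(self.buffer)

    def get_batch(self, batch_size):
        # uniform sampling without replacement once enough transitions are stored
        return random.sample(list(self.buffer), min(batch_size, len(self.buffer)))

    def erase(self):
        self.buffer.clear()
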
Example #17
class DDPG:
    """docstring for DDPG"""
    def __init__(self):
        self.name = 'DDPG'  # name for uploading results
        # self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = 12
        self.action_dim = 10
        self.has_kicked = False
        self.laststep_haskicked = False
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)
        self.saver = tf.train.Saver(max_to_keep=1)
        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

    def train(self):
        #print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        # print(minibatch)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        # print(q_value_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        with open('/home/ruizhao/Desktop/a.txt', 'a') as f:
            print("action_batch[0]", file=f)
            print(action_batch[0], file=f)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)
        with open('/home/ruizhao/Desktop/a.txt', 'a') as f:
            print("q_gradient_batch[0]", file=f)
            print(q_gradient_batch[0], file=f)
        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action2(self, state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def noise_action(self, state):
        action = self.actor_network.action(state)
        random_action = np.zeros(10, float)
        random_action[random.randint(0, 3)] = 1
        random_action[4] = random.uniform(-100, 100)  #DASH POWER
        random_action[5] = random.uniform(-180, 180)  #DASH DEGREES
        random_action[6] = random.uniform(-180, 180)  #TURN DEGREES
        random_action[7] = random.uniform(-180, 180)  #TACKLE DEGREES
        random_action[8] = random.uniform(0, 100)  #KICK POWER
        random_action[9] = random.uniform(-180, 180)  #KICK DEGREES
        if np.random.uniform() < EPSILON:
            return action
        else:
            return random_action

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Store transitions to replay start size then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        #if self.time_step % 10000 == 0:
        #self.actor_network.save_network(self.time_step)
        #self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
Example #18
class MaDDPG:
    def __init__(self, num_agents, state_dim, action_dim):
        # track training times
        self.time_step = 0
        # use set session use GPU
        #self.sess = tf.InteractiveSession()
        self.sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=True))
        self.num_agents = num_agents
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.agents = self.create_multi_agents(self.sess, num_agents,
                                               self.state_dim, self.action_dim)
        # make sure to create the CriticNetwork after the agents; it summarises the mean Q value inside
        self.critic = CriticNetwork(self.sess, state_dim, action_dim)
        self.exploration_noise = OUNoise((self.num_agents, action_dim))
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        # for store checkpoint
        self.saver = tf.train.Saver()

    def train(self):
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.zeros((BATCH_SIZE, self.num_agents, self.state_dim))
        action_batch = np.zeros((BATCH_SIZE, self.num_agents, self.action_dim))
        reward_batch = np.zeros((BATCH_SIZE, self.num_agents))
        next_state_batch = np.zeros(
            (BATCH_SIZE, self.num_agents, self.state_dim))
        done_batch = np.zeros((BATCH_SIZE))
        for ii in range(BATCH_SIZE):
            state_batch[ii, :, :] = minibatch[ii][0]
            action_batch[ii, :, :] = minibatch[ii][1]
            reward_batch[ii, :] = minibatch[ii][2]
            next_state_batch[ii, :, :] = minibatch[ii][3]
            done_batch[ii] = minibatch[ii][4]

        # calculate Gt batch
        next_action_batch = self.target_actions(next_state_batch)
        q_value_batch = self.critic.target_q(next_state_batch,
                                             next_action_batch)
        gt = np.zeros((BATCH_SIZE, self.num_agents))
        for ii in range(BATCH_SIZE):
            if done_batch[ii]:
                gt[ii, :] = reward_batch[ii, :]
            else:
                gt[ii, :] = reward_batch[ii, :] + GAMMA * q_value_batch[ii, :]
        #update critic by minimizing the loss
        self.critic.train(gt, state_batch, action_batch)

        # update policy using the sampling gradients
        actions_for_grad = self.actions(state_batch)
        q_gradients_batch = self.critic.gradients(state_batch,
                                                  actions_for_grad)
        self.train_agents(q_gradients_batch, state_batch)

        # update critic target network
        self.critic.update_target()

        # update actor target
        self.update_agents_target()

    def summary(self, record_num):
        if self.replay_buffer.count() > SUMMARY_BATCH_SIZE:
            mini_batch = self.replay_buffer.popn(SUMMARY_BATCH_SIZE)
            state_batch = np.zeros(
                (SUMMARY_BATCH_SIZE, self.num_agents, self.state_dim))
            for ii in range(SUMMARY_BATCH_SIZE):
                state_batch[ii, :, :] = mini_batch[ii][0]

            actions_for_summary = self.actions(state_batch)
            self.critic.write_summaries(state_batch, actions_for_summary,
                                        record_num)

    def update_agents_target(self):
        for agent in self.agents:
            agent.update_target()

    def train_agents(self, gradients_batch, state_batch):
        # gradients_batch = [batchsize* agents* action_dim]
        # state_batch = [batchsize* agents * state_dim ]
        for ii in range(self.num_agents):
            grad = gradients_batch[:, ii, :]
            state = state_batch[:, ii, :]
            self.agents[ii].train(grad, state)

    def create_multi_agents(self, sess, num_agents, state_dim, action_dim):
        agents = []
        nets = None
        for ii in range(num_agents):
            agent_name = 'agent' + str(ii)
            agents.append(
                ActorNetwork(sess, state_dim, action_dim, agent_name, nets))
            nets = agents[-1].nets
        return agents

    def add_agents(self, add_num):
        for ii in range(add_num):
            #self.num_agents+=1

            agent_name = 'agent' + str(self.num_agents)
            self.agents.append(
                ActorNetwork(self.sess, self.state_dim, self.action_dim,
                             agent_name, self.agents[-1].nets))
            # the agents' name is from 0-num_agents-1
            self.num_agents += 1

        # if add a new agent then reset the noise and replay buffer
        self.exploration_noise = OUNoise((self.num_agents, self.action_dim))
        #self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        self.replay_buffer.erase()
        # re-create a saver
        # the new saver will contain all the savable variables;
        # otherwise it would only contain the initially created agents
        self.saver = tf.train.Saver()
        # reset the time step
        # self.time_step = 0

    def action(self, state):
        # returns the action for a single state per agent, not batch-sized actions
        # state = [num_agents * state_dim]
        # actions = [num_agents *  action_dim]
        action = np.zeros((self.num_agents, self.action_dim))
        for ii in range(self.num_agents):
            action[ii, :] = self.agents[ii].action(state[ii, :])
        return action

    def actions(self, state_batch):
        #state = batch_size*numOfagents*state_dim
        #actions = batch_size*numOfagents*action_dim
        batch_size = state_batch.shape[0]
        actions = np.zeros((batch_size, self.num_agents, self.action_dim))
        for ii in range(self.num_agents):
            actions[:, ii, :] = self.agents[ii].actions(state_batch[:, ii, :])
        return actions

    def target_actions(self, state_batch):
        # the state size  is batch_size* num_agents * state_dimension
        actions = np.zeros(
            (state_batch.shape[0], self.num_agents, self.action_dim))
        for ii in range(self.num_agents):
            actions[:,
                    ii, :] = self.agents[ii].target_actions(state_batch[:,
                                                                        ii, :])
        return actions

    def noise_action(self, state):
        action = self.action(state)
        # clip the action, action \in [-1,+1]
        return np.clip(action + self.exploration_noise.noise(), -1, 1)

    def close_session(self):
        self.sess.close()

    def perceive(self, state, action, reward, next_state, done):
        # store {st,at,Rt+1,st+1}
        self.replay_buffer.add(state, action, reward, next_state, done)

        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.time_step += 1
            self.train()
            if self.time_step % SAVE_STEPS == 0:
                self.save_network()
            # if self.time_step % 10000 == 0:
            # self.actor_network.save_network(self.time_step)
            # self.critic_network.save_network(self.time_step)

            # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

    def load_network(self):
        checkpoint = tf.train.get_checkpoint_state("saved_network")
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print("Successfully loaded:", checkpoint.model_checkpoint_path)
        else:
            print('Could not find old network weights')

    def save_network(self):
        # do not run this while the folder is syncing with Dropbox;
        # exit Dropbox, then run
        print('save network...', self.time_step)
        self.saver.save(self.sess,
                        'saved_network/' + 'network',
                        global_step=self.time_step)
Example #19
class DDPG:
    """docstring for DDPG"""
    def __init__(self, env):
        self.name = 'DDPG' # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess,self.state_dim,self.action_dim)
        self.critic_network = CriticNetwork(self.sess,self.state_dim,self.action_dim)
        
        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

    def train(self):
        #print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch,[BATCH_SIZE,self.action_dim])

        # Calculate y_batch
        
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,next_action_batch)
        y_batch = []  
        for i in range(len(minibatch)): 
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else :
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch,[BATCH_SIZE,1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch,state_batch,action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(state_batch,action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch,state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self,state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        return action+self.exploration_noise.noise()

    def action(self,state):
        action = self.actor_network.action(state)
        return action

    def perceive(self,state,action,reward,next_state,done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state,action,reward,next_state,done)

        # Store transitions to replay start size then start training
        if self.replay_buffer.count() >  REPLAY_START_SIZE:
            self.train()

        #if self.time_step % 10000 == 0:
            #self.actor_network.save_network(self.time_step)
            #self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
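# Note: the DDPG examples above train the actor with the sampled policy gradient,
# feeding the critic's action-gradients (critic_network.gradients) into the
# actor's optimizer.  With an autodiff framework the same update can be written
# as minimising -Q(s, mu(s)).  A small PyTorch sketch under that assumption
# (toy networks, not the ActorNetwork/CriticNetwork classes used above):
import torch
import torch.nn as nn

state_dim, action_dim = 3, 1
actor = nn.Sequential(nn.Linear(state_dim, 32), nn.ReLU(),
                      nn.Linear(32, action_dim), nn.Tanh())
critic = nn.Sequential(nn.Linear(state_dim + action_dim, 32), nn.ReLU(),
                       nn.Linear(32, 1))
actor_opt = torch.optim.Adam(actor.parameters(), lr=1e-4)

states = torch.randn(16, state_dim)  # a sampled state batch
actor_loss = -critic(torch.cat([states, actor(states)], dim=1)).mean()
actor_opt.zero_grad()
actor_loss.backward()  # gradients flow through the critic into the actor
actor_opt.step()
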
Example #20
class DDPG(object):
    def __init__(self, a_dim, s_dim, a_bound, m_dim, att_dim):
        self.memory = ReplayBuffer(MEMORY_CAPACITY)
        self.pointer = 0
        self.sess = tf.Session()

        self.a_dim, self.s_dim, self.a_bound, self.m_dim, self.att_dim = a_dim, s_dim, a_bound, m_dim, att_dim
        self.S = tf.placeholder(tf.float32, [None, s_dim], 's')
        self.S_ = tf.placeholder(tf.float32, [None, s_dim], 's_')
        self.R = tf.placeholder(tf.float32, [None, 1], 'r')
        self.GM = tf.placeholder(tf.float32, [None, m_dim, m_dim, 1], 'gm')
        self.LM = tf.placeholder(tf.float32, [None, m_dim, m_dim, 1], 'lm')
        self.LM_ = tf.placeholder(tf.float32, [None, m_dim, m_dim, 1], 'lm_')

        self.a = self._build_a(
            self.S,
            self.GM,
            self.LM,
        )
        q = self._build_c(
            self.S,
            self.GM,
            self.LM,
            self.a,
        )
        a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                     scope='Actor')
        c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                     scope='Critic')
        ema = tf.train.ExponentialMovingAverage(decay=1 -
                                                TAU)  # soft replacement

        def ema_getter(getter, name, *args, **kwargs):
            return ema.average(getter(name, *args, **kwargs))

        target_update = [ema.apply(a_params),
                         ema.apply(c_params)]  # soft update operation
        a_ = self._build_a(
            self.S_, self.GM, self.LM_, reuse=True,
            custom_getter=ema_getter)  # replaced target parameters
        q_ = self._build_c(self.S_,
                           self.GM,
                           self.LM_,
                           a_,
                           reuse=True,
                           custom_getter=ema_getter)

        a_loss = -tf.reduce_mean(q)  # maximize the q
        self.atrain = tf.train.AdamOptimizer(LR_A).minimize(a_loss,
                                                            var_list=a_params)

        with tf.control_dependencies(
                target_update):  # soft replacement happens here
            q_target = self.R + GAMMA * q_
            td_error = tf.losses.mean_squared_error(labels=q_target,
                                                    predictions=q)
            self.ctrain = tf.train.AdamOptimizer(LR_C).minimize(
                td_error, var_list=c_params)

        self.sess.run(tf.global_variables_initializer())

    def choose_action(self, s1, gm1, lm1):
        return self.sess.run(
            self.a, {
                self.S: s1[np.newaxis, :],
                self.GM: gm1[np.newaxis, :, :, np.newaxis],
                self.LM: lm1[np.newaxis, :, :, np.newaxis]
            })[0]

    def learn(self):
        replay = self.memory.get_batch(BATCH_SIZE)
        bm_sd = np.asarray([data[0] for data in replay])
        bs = np.asarray([data[1] for data in replay])
        bloc = np.asarray([data[2] for data in replay])
        ba = np.asarray([data[3] for data in replay])
        br = np.asarray([data[4] for data in replay])
        bs_ = np.asarray([data[5] for data in replay])
        bloc_ = np.asarray([data[6] for data in replay])
        bgm = np.zeros([BATCH_SIZE, self.m_dim, self.m_dim, 1])
        blm = np.zeros([BATCH_SIZE, self.m_dim, self.m_dim, 1])
        blm_ = np.zeros([BATCH_SIZE, self.m_dim, self.m_dim, 1])
        for batch in range(BATCH_SIZE):
            sd1 = bm_sd[batch]
            terrian_map = TerrainMap(sd1, MAP_DIM, GLOBAL_PIXEL_METER)
            bgm[batch, :, :, 0] = terrian_map.map_matrix
            blm[batch, :, :, 0] = terrian_map.get_local_map(bloc[batch, :])
            blm_[batch, :, :, 0] = terrian_map.get_local_map(bloc_[batch, :])

        self.sess.run(self.atrain, {self.S: bs, self.GM: bgm, self.LM: blm})
        self.sess.run(self.ctrain, {
            self.S: bs,
            self.a: ba,
            self.R: br,
            self.S_: bs_,
            self.LM_: blm_
        })

    def _build_a(self, s, gm, locm, reuse=None, custom_getter=None):
        def _conv2d_keep_size(x,
                              y,
                              kernel_size,
                              name,
                              use_bias=False,
                              reuse_conv=None,
                              trainable_conv=True):
            return tf.layers.conv2d(
                inputs=x,
                filters=y,
                kernel_size=kernel_size,
                padding="same",
                use_bias=use_bias,
                kernel_initializer=tf.truncated_normal_initializer(
                    stddev=0.01),
                bias_initializer=tf.truncated_normal_initializer(stddev=0.01),
                reuse=reuse_conv,
                name=name,
                trainable=trainable_conv)

        def _build_vin(mat, name, trainable_vin):
            h1 = _conv2d_keep_size(mat,
                                   150,
                                   5,
                                   name + "_h1",
                                   use_bias=True,
                                   trainable_conv=trainable_vin)
            r = _conv2d_keep_size(h1,
                                  1,
                                  1,
                                  name + "_r",
                                  trainable_conv=trainable_vin)
            q0 = _conv2d_keep_size(r,
                                   10,
                                   5,
                                   name + "_q0",
                                   trainable_conv=trainable_vin)
            v = tf.reduce_max(q0, axis=3, keep_dims=True, name=name + "_v")
            for k in range(30):
                rv = tf.concat([r, v], axis=3)
                q = _conv2d_keep_size(rv,
                                      10,
                                      5,
                                      name + "_q",
                                      reuse_conv=tf.AUTO_REUSE,
                                      trainable_conv=trainable_vin)
                v = tf.reduce_max(q, axis=3, keep_dims=True, name=name + "_v")
            return v

        trainable = True if reuse is None else False
        with tf.variable_scope('Actor',
                               reuse=reuse,
                               custom_getter=custom_getter):
            gv = _build_vin(gm, name="global_map_vin", trainable_vin=trainable)
            loc_co = _conv2d_keep_size(locm,
                                       1,
                                       9,
                                       name="local_co",
                                       use_bias=False,
                                       trainable_conv=trainable)
            lv = tf.multiply(gv, loc_co)
            m_flat = tf.reshape(lv, [-1, self.m_dim**2])
            att = tf.layers.dense(m_flat,
                                  self.att_dim,
                                  name='att_l1',
                                  trainable=trainable)
            layer_1 = tf.layers.dense(s,
                                      300,
                                      activation=tf.nn.relu,
                                      name='l1',
                                      trainable=trainable)
            layer_2a = tf.layers.dense(layer_1,
                                       600,
                                       name='l2a',
                                       trainable=trainable)
            layer_2att = tf.layers.dense(att,
                                         600,
                                         name='l2att',
                                         trainable=trainable)
            layer_2 = tf.add(layer_2a, layer_2att, name="l2")
            layer_3 = tf.layers.dense(layer_2,
                                      600,
                                      activation=tf.nn.relu,
                                      name='l3',
                                      trainable=trainable)
            a1 = tf.layers.dense(layer_3,
                                 4,
                                 activation=tf.nn.tanh,
                                 name='a1',
                                 trainable=trainable)
            a1_norm = tf.nn.l2_normalize(a1, dim=-1)
            a2 = tf.layers.dense(layer_3,
                                 3,
                                 activation=tf.nn.tanh,
                                 name='a2',
                                 trainable=trainable)
            a = tf.concat([a1_norm, a2], axis=-1)
            return tf.multiply(a, self.a_bound, name='scaled_a')

    def _build_c(self, s, gm, locm, a, reuse=None, custom_getter=None):
        trainable = True if reuse is None else False
        with tf.variable_scope('Critic',
                               reuse=reuse,
                               custom_getter=custom_getter):
            gm_flat = tf.reshape(gm, [-1, self.m_dim**2])
            layer_gm = tf.layers.dense(gm_flat,
                                       self.s_dim,
                                       activation=tf.nn.relu,
                                       name='lgm',
                                       trainable=trainable)
            lm_flat = tf.reshape(locm, [-1, self.m_dim**2])
            layer_lm = tf.layers.dense(lm_flat,
                                       self.s_dim,
                                       activation=tf.nn.relu,
                                       name='llm',
                                       trainable=trainable)
            s_all = tf.concat([layer_gm, layer_lm, s], axis=1)  # concatenate along the feature axis
            layer_1 = tf.layers.dense(s_all,
                                      300,
                                      activation=tf.nn.relu,
                                      name='l1',
                                      trainable=trainable)
            layer_2s = tf.layers.dense(layer_1,
                                       600,
                                       activation=None,
                                       name='l2s',
                                       trainable=trainable)
            layer_2a = tf.layers.dense(a,
                                       600,
                                       activation=None,
                                       name='l2a',
                                       trainable=trainable)
            layer_2 = tf.add(layer_2s, layer_2a, name="l2")
            layer_3 = tf.layers.dense(layer_2,
                                      600,
                                      activation=tf.nn.relu,
                                      name='l3',
                                      trainable=trainable)
            return tf.layers.dense(layer_3, 1, trainable=trainable)  # Q(s,a)
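# Note: both DDPG(object) variants in this file realise the soft target update
# through tf.train.ExponentialMovingAverage plus a custom getter, so the target
# networks read EMA-smoothed copies of the online weights.  The effect is the
# usual Polyak update; a framework-free sketch with illustrative parameter
# dicts (not the TF variables above):
import numpy as np

def soft_update_sketch(online_params, target_params, tau=0.01):
    """target <- tau * online + (1 - tau) * target, per parameter array."""
    for name, w in online_params.items():
        target_params[name] = tau * w + (1.0 - tau) * target_params[name]
    return target_params

# with tau = 1 - decay this mirrors what ema.apply(...) maintains
online = {'w': np.ones(3)}
target = {'w': np.zeros(3)}
print(soft_update_sketch(online, target, tau=0.1)['w'])  # [0.1 0.1 0.1]
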
Example #21
class DDPG:
    def __init__(self, env):
        self.name = 'DDPG'
        self.environment = env
        self.episode = 0
        self.epsilon = 0.98
        self.one_number = 1
        self.mean = []
        self.state_dim = len(obs2state(env.reset().observation))
        self.action_dim = env.action_spec().shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        self.exploration_noise = OUNoise(self.action_dim)

    def train(self):

        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        self.critic_network.train(y_batch, state_batch, action_batch)
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        action = self.actor_network.action(state)
        exp = self.exploration_noise.noise()
        t = action * exp
        return exp

    def action(self, state):
        if np.random.rand() <= self.epsilon:
            act = self.noise_action(state)
            z = array(act)
        else:
            action = self.actor_network.action(state)
            z = array(action)
        self.mean.append(z[0])
        g = np.tanh(z)
        return g

    def perceive(self, state, action, reward, next_state, done):
        self.replay_buffer.add(state, action, reward, next_state, done)
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()
        if self.epsilon > 0.1:
            self.epsilon *= 0.99999

        if done:
            self.exploration_noise.reset()
Example #22
class DDPG:

    def __init__(self, state_dim, state_channel, action_dim):
        self.state_dim = state_dim
        self.state_channel = state_channel
        self.action_dim = action_dim

        self.sess = tf.InteractiveSession()
        self.state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel])
        self.target_state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel])
        self.action_input = tf.placeholder('float', [None, action_dim])

        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim)

        # create network
        self.actor_network.create_network(self.state_input)
        self.critic_network.create_q_network(self.state_input, self.actor_network.action_output)

        # create target network
        self.actor_network.create_target_network(self.target_state_input)
        self.critic_network.create_target_q_network(self.target_state_input, self.actor_network.target_action_output)

        # create training method
        self.actor_network.create_training_method(self.critic_network.q_value_output)
        self.critic_network.create_training_method()

        self.sess.run(tf.initialize_all_variables())
        self.actor_network.update_target()
        self.critic_network.update_target()

        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        self.exploration_noise = OUNoise(self.action_dim)

        self.dir_path = os.path.dirname(os.path.realpath(__file__)) + '/models_ddpg'
        if not os.path.exists(self.dir_path):
            os.mkdir(self.dir_path)

        # for log
        self.reward_input = tf.placeholder(tf.float32)
        tf.scalar_summary('reward', self.reward_input)
        self.time_input = tf.placeholder(tf.float32)
        tf.scalar_summary('living_time', self.time_input)
        self.summary_op = tf.merge_all_summaries()
        self.summary_writer = tf.train.SummaryWriter(self.dir_path + '/log', self.sess.graph)

        self.episode_reward = 0.0
        self.episode_start_time = 0.0

        self.time_step = 1
        self.saver = tf.train.Saver(tf.all_variables())
        self.load_time_step()
        self.load_network()
        return

    def train(self):
        action_dim = self.action_dim

        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)  # sample BATCH_SIZE from replay_buffer
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # if action_dim == 1, the action is a scalar rather than an array
        action_batch = np.resize(action_batch, [BATCH_SIZE, action_dim])

        # calculate y_batch via target network
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q_value(next_state_batch, next_action_batch)

        y_batch = []
        for i in range(BATCH_SIZE):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])

        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # print np.shape(reward_batch), np.shape(y_batch)

        # train actor network
        self.actor_network.train(state_batch)

        # train critic network
        self.critic_network.train(y_batch, state_batch, action_batch)

        # update target network
        self.actor_network.update_target()
        self.critic_network.update_target()
        return

    def noise_action(self, state):
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def _record_log(self, reward, living_time):
        summary_str = self.sess.run(self.summary_op, feed_dict={
            self.reward_input: reward,
            self.time_input: living_time
        })
        self.summary_writer.add_summary(summary_str, self.time_step)
        return

    def perceive(self, state, action, reward, next_state, done):
        self.replay_buffer.add(state, action, reward, next_state, done)
        if self.episode_start_time == 0.0:
            self.episode_start_time = time.time()
        # for testing
        # self.time_step += 1
        # if self.time_step == 100:
        #     print '--------------------------------'
        #     self.replay_buffer.save_to_pickle()
        # return
        
        self.episode_reward += reward
        living_time = time.time() - self.episode_start_time
        if self.time_step % 1000 == 0 or done:
            self._record_log(self.episode_reward, living_time)

        if self.replay_buffer.size() > REPLAY_START_SIZE:
            self.train()

        if self.time_step % 100000 == 0:
            self.save_network()

        if done:
            print '===============reset noise========================='
            self.exploration_noise.reset()
            self.episode_reward = 0.0
            self.episode_start_time = time.time()

        self.time_step += 1
        return

    def load_time_step(self):
        if not os.path.exists(self.dir_path):
            return
        files = os.listdir(self.dir_path)
        step_list = []
        for filename in files:
            if ('meta' in filename) or ('-' not in filename):
                continue
            step_list.append(int(filename.split('-')[-1]))
        step_list = sorted(step_list)
        if len(step_list) == 0:
            return
        self.time_step = step_list[-1] + 1
        return

    def load_network(self):
        checkpoint = tf.train.get_checkpoint_state(self.dir_path)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print 'Successfully loaded:', checkpoint.model_checkpoint_path
        else:
            print 'Could not find old network weights'
        return

    def save_network(self):
        print 'save actor-critic network...', self.time_step
        self.saver.save(self.sess, self.dir_path + '/ddpg', global_step=self.time_step)
        return
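
# The update_target() calls used throughout these examples are not shown here.
# A common implementation is the soft ("Polyak") update
# theta_target <- tau * theta + (1 - tau) * theta_target; the helper below is
# only a sketch of that idea in the same TF1 style as the surrounding code,
# not the exact code behind these ActorNetwork/CriticNetwork classes.
import tensorflow as tf

def make_soft_update_op(eval_vars, target_vars, tau=0.001):
    # Pair each evaluation variable with its target counterpart and blend them.
    ops = [tf.assign(t, tau * e + (1.0 - tau) * t)
           for e, t in zip(eval_vars, target_vars)]
    return tf.group(*ops)

# Usage sketch: build the op once, then run it after every training step.
# soft_update = make_soft_update_op(actor_eval_vars, actor_target_vars)
# sess.run(soft_update)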
Exemple #23
class DDPG(object):
    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        self.epsilon_expert_range = (1.0, 0.1)
        self.epsilon_expert = self.epsilon_expert_range[0]
        self.epsilon_random_range = (0.1, 0.01)
        self.epsilon_random = self.epsilon_random_range[0]
        # Randomly initialize actor network and critic network
        # with both their target networks
        # self.state_dim = env.observation_space.shape[0]
        self.state_dim = 16
        # self.action_dim = env.action_space.shape[0]
        self.action_dim = 3
        self.time_step = 0
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        # self.exploration_noise = OUNoise(self.action_dim)
        # self.exploration_noise = OUNoise()
        self.OU = OU()
        # loading networks
        self.saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state(MODEL_PATH)
        if checkpoint and checkpoint.model_checkpoint_path:
            path = checkpoint.model_checkpoint_path
            self.saver.restore(self.sess, path)
            self.time_step = int(path[path.rindex('-') + 1:])
            self.epsilon_expert -= (
                self.epsilon_expert_range[0] -
                self.epsilon_expert_range[1]) * self.time_step / EXPLORE_COUNT
            self.epsilon_expert = max(self.epsilon_expert,
                                      self.epsilon_expert_range[1])
            self.epsilon_random -= (
                self.epsilon_random_range[0] -
                self.epsilon_random_range[1]) * self.time_step / EXPLORE_COUNT
            self.epsilon_random = max(self.epsilon_random,
                                      self.epsilon_random_range[1])
            logger.warn(
                "Successfully loaded: %s, step: %d, epsilon_expert: %s, epsilon_random: %s"
                % (path, self.time_step, self.epsilon_expert,
                   self.epsilon_random))
        else:
            logger.warn("Could not find old network weights")

        self.critic_cost = 0

    def train(self):
        self.time_step = self.time_step + 1
        self.epsilon_expert -= (self.epsilon_expert_range[0] -
                                self.epsilon_expert_range[1]) / EXPLORE_COUNT
        self.epsilon_expert = max(self.epsilon_expert,
                                  self.epsilon_expert_range[1])
        self.epsilon_random -= (self.epsilon_random_range[0] -
                                self.epsilon_random_range[1]) / EXPLORE_COUNT
        self.epsilon_random = max(self.epsilon_random,
                                  self.epsilon_random_range[1])
        logger.debug(
            "step: %d, epsilon_expert: %s, epsilon_random: %s" %
            (self.time_step, self.epsilon_expert, self.epsilon_random))
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
            # if done_batch[i]:
            #     y_batch.append(reward_batch[i])
            # else :
            #     y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_cost = self.critic_network.train(y_batch, state_batch,
                                                     action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    # def noise_action(self,state):
    #     # Select action a_t according to the current policy and exploration noise
    #     action = self.actor_network.action(state)
    #     noise = self.exploration_noise.noise(action)
    #     noise_action = action + noise
    #     clipped_noise_action = np.clip(noise_action, 0, 1)
    #     return clipped_noise_action

    # def noise_action(self,state):
    #     # Select action a_t according to the current policy and exploration noise
    #     action = self.actor_network.action(state)
    #     noise = np.zeros(self.action_dim)
    #     noise[0] = self.epsilon * self.OU.function(action[0], 0.5, 1.00, 0.10)
    #     noise[1] = self.epsilon * self.OU.function(action[1], 0.5, 1.00, 0.10)
    #     noise[2] = self.epsilon * self.OU.function(action[2], 0.5, 1.00, 0.10)
    #     noise_action = action + noise
    #     logger.debug("action: %s, noise: %s" % (action, noise))
    #     clipped_noise_action = np.clip(noise_action, 0, 1)
    #     return clipped_noise_action

    def action(self, state):
        action = self.actor_network.action(state)
        logger.debug("action: %s" % (action))
        return action

    def opposite_action(self, state):
        logger.debug("state: %s" % (state))
        action = self.actor_network.action(state)
        logger.debug("action: %s" % (action))
        action[0] = 1 - action[0]
        logger.debug("opposite action: %s" % (action))
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # self.time_step = self.time_step + 1

        # Store transitions to replay start size then start training
        if self.replay_buffer.count() >= REPLAY_START_SIZE:
            # logger.debug("train...")
            self.train()

        #if self.time_step % 10000 == 0:
        #self.actor_network.save_network(self.time_step)
        #self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        # if done:
        #     self.exploration_noise.reset()

    def saveNetwork(self):
        logger.warn("time step: %s, save model" % (self.time_step))
        ckpt_file = os.path.join(MODEL_PATH, 'DDPG')
        self.saver.save(self.sess, ckpt_file, global_step=self.time_step)
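
# Exemple #23 anneals epsilon_expert and epsilon_random linearly from a start
# value to an end value over EXPLORE_COUNT steps, both when restoring a
# checkpoint and on every train() call. The helper below restates that schedule
# as a stand-alone function; it is a sketch for clarity, not code from the
# example itself.
def linear_epsilon(step, start, end, explore_count):
    """Linearly anneal from `start` to `end` over `explore_count` steps,
    then stay at `end`."""
    eps = start - (start - end) * step / float(explore_count)
    return max(eps, end)

# e.g. linear_epsilon(0, 1.0, 0.1, 10000) == 1.0
#      linear_epsilon(5000, 1.0, 0.1, 10000) == 0.55
#      linear_epsilon(20000, 1.0, 0.1, 10000) == 0.1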
Exemple #24
class DDPG:
    """docstring for DDPG"""
    def __init__(self, sess, data_fname, replay=False):
        self.name = 'DDPG'  # name for uploading results
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = Hp.state_dim
        self.action_dim = Hp.action_dim
        print(self.state_dim, self.action_dim)

        self.sess = sess

        self.state_input = [
            tf.placeholder(tf.float32, shape=(None, None, Hp.n_coord))
            for _ in xrange(Hp.categories)
        ]
        #tf.placeholder("float",[None,self.state_dim])
        self.target_state_input = [
            tf.placeholder(tf.float32, shape=(None, None, Hp.n_coord))
            for _ in xrange(Hp.categories)
        ]
        #tf.placeholder("float",[None,self.state_dim])
        self.state_network = StateEnc(self.sess, self.state_input,
                                      self.target_state_input)
        state_batch = self.state_network.encoding
        next_state_batch = self.state_network.target_encoding

        (weights, biases, w_i2h0, w_h2h0, w_b0, w_i2h1, w_h2h1, w_b1,
         w_i2h2, w_h2h2, w_b2) = self.state_network.get_parameters()

        state_network_params = weights + biases + [
            w_i2h0, w_h2h0, w_b0, w_i2h1, w_h2h1, w_b1, w_i2h2, w_h2h2, w_b2
        ]

        self.actor_network = ActorNetwork(self.sess, Hp.n_hidden,
                                          self.action_dim, self.state_input,
                                          state_batch, next_state_batch,
                                          state_network_params)
        self.critic_network = CriticNetwork(self.sess, Hp.n_hidden,
                                            self.action_dim, state_batch,
                                            next_state_batch)

        # initialize replay buffer
        if replay:
            self.replay_buffer = ReplayBuffer(Hp.REPLAY_BUFFER_SIZE,
                                              data_fname)
        self.summary_str2 = None

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

    def train(self):
        #print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatches = self.replay_buffer.get_batch(Hp.batch_size * Hp.N_TRAIN)
        print("######## TRAINING #########")
        for k in range(Hp.N_TRAIN):
            minibatch = minibatches[k * Hp.batch_size:(k + 1) * Hp.batch_size]
            state_batch_r = np.asarray([data[0] for data in minibatch])
            state_batch = []
            for j in range(Hp.categories):
                new_cat = np.stack(state_batch_r[:, j], axis=0)
                state_batch.append(new_cat)
            #state_batch = [np.expand_dims(state_batch, axis=1)]
            action_batch = np.asarray([data[1] for data in minibatch])
            reward_batch = np.asarray([data[2] for data in minibatch])
            next_state_batch_r = np.asarray([data[3] for data in minibatch])
            next_state_batch = []
            for j in range(Hp.categories):
                new_cat = np.stack(next_state_batch_r[:, j], axis=0)
                next_state_batch.append(new_cat)
            #next_state_batch = [np.expand_dims(next_state_batch, axis=1)]
            done_batch = np.asarray([data[4] for data in minibatch])

            # for action_dim = 1
            action_batch = np.resize(action_batch,
                                     [Hp.batch_size, self.action_dim])

            next_action_batch = self.actor_network.target_actions(
                self.target_state_input, next_state_batch)
            q_value_batch = self.critic_network.target_q(
                self.target_state_input, next_state_batch, next_action_batch)
            y_batch = []

            for i in range(len(minibatch)):
                if done_batch[i]:
                    y_batch.append(reward_batch[i])
                else:
                    y_batch.append(reward_batch[i] +
                                   Hp.GAMMA * q_value_batch[i])

            y_batch = np.resize(y_batch, [Hp.batch_size, 1])

            # Update critic by minimizing the loss L
            self.critic_network.train(y_batch, self.state_input, state_batch,
                                      action_batch)

            # Update the actor policy using the sampled gradient:
            action_batch_for_gradients = self.actor_network.actions(
                self.state_input, state_batch)
            q_gradient_batch = self.critic_network.gradients(
                self.state_input, state_batch, action_batch_for_gradients)

            self.summary_str2 = self.actor_network.train(
                q_gradient_batch, self.state_input, state_batch)

            # Update the target networks
            self.actor_network.update_target()
            self.critic_network.update_target()
            self.state_network.update_target()

    def train_off(self, minibatch):
        #print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        state_batch_r = np.asarray([data[0] for data in minibatch])
        state_batch = []
        for j in range(Hp.categories):
            new_cat = np.stack(state_batch_r[:, j], axis=0)
            state_batch.append(new_cat)
        #state_batch = [np.expand_dims(state_batch, axis=1)]
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch_r = np.asarray([data[3] for data in minibatch])
        next_state_batch = []
        for j in range(Hp.categories):
            new_cat = np.stack(next_state_batch_r[:, j], axis=0)
            next_state_batch.append(new_cat)
        #next_state_batch = [np.expand_dims(next_state_batch, axis=1)]
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch,
                                 [Hp.batch_size, self.action_dim])

        next_action_batch = self.actor_network.target_actions(
            self.target_state_input, next_state_batch)
        q_value_batch = self.critic_network.target_q(self.target_state_input,
                                                     next_state_batch,
                                                     next_action_batch)
        y_batch = []

        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + Hp.GAMMA * q_value_batch[i])

        y_batch = np.resize(y_batch, [Hp.batch_size, 1])

        # Update critic by minimizing the loss L
        cost, self.summary_str2 = self.critic_network.train_off(
            y_batch, self.state_input, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(
            self.state_input, state_batch)
        q_gradient_batch = self.critic_network.gradients(
            self.state_input, state_batch, action_batch_for_gradients)

        summary_str3 = self.actor_network.train(q_gradient_batch,
                                                self.state_input, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()
        self.state_network.update_target()
        return cost

    def action(self, state):
        state = [np.expand_dims(el, axis=0) for el in state]
        action = self.actor_network.action(state)
        return np.multiply(action, np.array([-35.0, 35.0, 2000.0]))
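
# DDPG.action above rescales the actor output elementwise by
# [-35.0, 35.0, 2000.0]. The sketch below shows the same kind of per-dimension
# rescaling, assuming the policy emits values in [-1, 1]; that bound is an
# assumption for illustration, since the actor's output range is not shown in
# this snippet.
import numpy as np

ACTION_SCALE = np.array([-35.0, 35.0, 2000.0])

def scale_action(raw_action, scale=ACTION_SCALE):
    # Clip to the assumed bounded range, then map each component to its
    # physical range with one scale factor per action dimension.
    raw_action = np.clip(np.asarray(raw_action, dtype=np.float64), -1.0, 1.0)
    return raw_action * scale

# scale_action([0.5, -1.0, 0.1]) -> array([-17.5, -35.0, 200.0])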
class Worker:
    """docstring for DDPG"""
    def __init__(self, sess, number, model_path, global_episodes, explore,
                 decay, training):
        self.name = 'worker_' + str(number)  # name for uploading results
        self.number = number
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = 41
        self.action_dim = 18
        self.model_path = model_path
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.sess = sess
        self.explore = explore
        self.decay = decay
        self.training = training

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim,
                                          self.name + '/actor')
        self.actor_network.update_target(self.sess)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim,
                                            self.name + '/critic')
        self.critic_network.update_target(self.sess)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        self.update_local_ops_actor = update_target_graph(
            'global/actor', self.name + '/actor')
        self.update_local_ops_critic = update_target_graph(
            'global/critic', self.name + '/critic')

    def start(self, setting=0):
        self.env = RunEnv(visualize=True)
        self.setting = setting

    def train(self):
        #print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch

        next_action_batch = self.actor_network.target_actions(
            self.sess, next_state_batch)
        q_value_batch = self.critic_network.target_q(self.sess,
                                                     next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(
            self.sess, state_batch)
        q_gradient_batch = self.critic_network.gradients(
            self.sess, state_batch, action_batch_for_gradients)

        self.actor_network.train(self.sess, q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target(self.sess)
        self.critic_network.update_target(self.sess)

    def save_model(self, saver, episode):
        #if self.episode % 10 == 1:
        if self.name == 'worker_0':
            saver.save(self.sess,
                       self.model_path + "/model-" + str(episode) + ".ckpt")

    def noise_action(self, state, decay):
        # Select action a_t according to the current policy and exploration noise which gradually vanishes
        action = self.actor_network.action(self.sess, state)
        return action + self.exploration_noise.noise() * decay

    def action(self, state):
        action = self.actor_network.action(self.sess, state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Store transitions to replay start size then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE and self.training:
            self.train()

        #if self.time_step % 10000 == 0:
        #self.actor_network.save_network(self.time_step)
        #self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

    def work(self, coord, saver):
        if self.training:
            episode_count = self.sess.run(self.global_episodes)
        else:
            episode_count = 0
        wining_episode_count = 0
        total_steps = 0
        print("Starting worker_" + str(self.number))

        with self.sess.as_default(), self.sess.graph.as_default():
            #not_start_training_yet = True
            while not coord.should_stop():
                returns = []
                rewards = []
                episode_reward = 0

                # change Aug20: stochastically apply noise
                if np.random.rand() < 0.9:
                    noisy = True
                    self.decay -= 1. / self.explore
                else:
                    noisy = False

                self.sess.run(self.update_local_ops_actor)
                self.sess.run(self.update_local_ops_critic)

                state = self.env.reset(difficulty=self.setting)
                #print(observation)
                s = process_frame(state)

                print "episode:", episode_count
                # Train

                for step in xrange(self.env.spec.timestep_limit):
                    state = process_frame(state)
                    if noisy:
                        action = np.clip(
                            self.noise_action(state, np.maximum(self.decay,
                                                                0)), 0.0, 1.0
                        )  # change Aug20, decay noise (no noise after ep>=self.explore)
                    else:
                        action = self.action(state)
                    next_state, reward, done, _ = self.env.step(action)
                    #print('state={}, action={}, reward={}, next_state={}, done={}'.format(state, action, reward, next_state, done))
                    next_state = process_frame(next_state)
                    self.perceive(state, action, reward * 100, next_state,
                                  done)
                    state = next_state
                    episode_reward += reward
                    if done:
                        break

                if episode_count % 5 == 0:
                    print "episode reward:", episode_reward

                # Testing:
                #if episode % 1 == 0:
                if self.name == 'worker_0' and episode_count % 50 == 0 and episode_count > 1:  # change Aug19
                    self.save_model(saver, episode_count)
                    total_return = 0
                    ave_reward = 0
                    for i in xrange(TEST):
                        state = self.env.reset()
                        reward_per_step = 0
                        for j in xrange(self.env.spec.timestep_limit):
                            action = self.action(
                                process_frame(state))  # direct action for test
                            state, reward, done, _ = self.env.step(action)
                            total_return += reward
                            reward_per_step += (reward -
                                                reward_per_step) / (j + 1)
                            if done:
                                break
                        ave_reward += reward_per_step

                    ave_return = total_return / TEST
                    ave_reward = ave_reward / TEST
                    returns.append(ave_return)
                    rewards.append(ave_reward)

                    print 'episode: ', episode_count, 'Evaluation Average Return:', ave_return, '  Evaluation Average Reward: ', ave_reward

                if self.name == 'worker_0' and self.training:
                    self.sess.run(self.increment)
                episode_count += 1

            # All done; stop the trial and confirm exit
            print('Done ' + self.name)
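
# Every example here draws exploration noise from an OUNoise object whose code
# is not shown. The Ornstein-Uhlenbeck process commonly used with DDPG evolves
# as x <- x + theta * (mu - x) + sigma * N(0, 1); the class below is a sketch
# of such a process, not the exact OUNoise implementation used by these
# examples.
import numpy as np

class OUNoiseSketch(object):
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Start each episode from the long-run mean.
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state

# Usage sketch: noisy_action = actor_action + OUNoiseSketch(action_dim).noise()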
Exemple #26
class DDPG:
    """docstring for DDPG"""
    def __init__(self, env, results_file):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        results_file.write(ActorNetwork.get_settings())

    def train(self):
        #print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Store transitions to replay start size then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        #if self.time_step % 10000 == 0:
        #self.actor_network.save_network(self.time_step)
        #self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
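
# The examples rely on a ReplayBuffer exposing add(), get_batch(), count()
# (and in one case size()). Below is a deque-based sketch of that interface;
# it is an assumption about the implementation, not the buffer actually used
# by these examples.
import random
from collections import deque

class ReplayBufferSketch(object):
    def __init__(self, buffer_size):
        self.buffer = deque(maxlen=buffer_size)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def get_batch(self, batch_size):
        # Sample without replacement; callers are expected to compare count()
        # against REPLAY_START_SIZE before training.
        return random.sample(list(self.buffer), batch_size)

    def count(self):
        return len(self.buffer)

    size = count  # some examples call size() instead of count()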
Exemple #27
class DDPG:
    """docstring for DDPG"""


    def __init__(self, a_dim, s_dim):
        self.name = 'DDPG'  # name for uploading results
        # self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = s_dim
        self.action_dim = a_dim
        self.time_step = 0
        self.max_bw = 0.0
        self.max_cwnd = 0.0
        self.min_rtt = 9999999.0

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

    def learn(self):
        # print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        self.time_step += 1
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        noise = self.exploration_noise.noise()
        # print("noise:" + str(noise))
        return action + noise

    def choose_action(self, state):
        self.time_step += 1
        # print("_______________________choose_action_____________________")
        action = self.actor_network.action(state)
        return action

    def store_transition(self, s, a, r, s_, done, episode_count):

        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        # print("*********************************ADD****************************")
        self.replay_buffer.add(s, a, r, s_, done)

        # Store transitions to replay start size then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            if (episode_count + 1) % 100 != 0:
                self.learn()
                # print("learn!")
            else:
                self.actor_network.save_network(self.time_step)
                self.critic_network.save_network(self.time_step)


        # Re-iniitialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

    def extract_observation(self, dataRecorder, subflow_index, state_before):
        # print("extracting...")
        value_dic = dataRecorder.get_latest_data()
        state_after = state_before.reshape(10, 5)
        # observation = np.zeros((4))
        observation = np.zeros((5))
        t_cWnd = [0, 0]
        t_thr = [0, 0]
        t_rtt = [0, 0]
        t_loss_rate = [0, 0]
        t_unAck = [0, 0]
        s0 = [0, 0, 0, 0, 0]
        state = np.zeros(1)
        for i in range(value_dic["nbOfSubflows"]):
            name = "cWnd" + str(i)
            t_cWnd[i] = value_dic[name]
            name = "rtt" + str(i)
            t_rtt[i] = value_dic[name]
            name = "unAck" + str(i)
            t_unAck[i] = value_dic[name]
            name = "loss_rate" + str(i)
            t_loss_rate[i] = value_dic[name]
            name = "throughput" + str(i)
            t_thr[i] = value_dic[name]

        thr = t_thr[subflow_index]
        s0[0] = t_thr[subflow_index]

        rtt = t_rtt[subflow_index]
        s0[1] = t_rtt[subflow_index]

        cwnd = t_cWnd[subflow_index]
        s0[2] = t_cWnd[subflow_index]

        loss_rate = t_loss_rate[subflow_index]
        s0[3] = t_loss_rate[subflow_index]

        unAck = t_unAck[subflow_index]
        s0[4] = t_unAck[subflow_index]

        s0 = np.array(s0)
        min_ = s0 - s0

        thr_n = s0[0]
        thr_n_min = s0[0] - min_[0]
        rtt_min = s0[1] - min_[1]
        cwnd_n_min = s0[2] - min_[2]
        loss_rate_n_min = s0[3] - min_[3]
        unAck_n_min = s0[4] - min_[4]

        # loss_rate_n_min=s0[7]-min_[7]

        if self.max_bw < thr_n_min:
            self.max_bw = thr_n_min
        if self.max_cwnd < cwnd_n_min:
            self.max_cwnd = cwnd_n_min
        if self.min_rtt > rtt_min:
            self.min_rtt = rtt_min

        reward = thr_n_min - 5 * (rtt_min - self.min_rtt) - 10 * loss_rate_n_min
        print("reward: %s  thr_n_min: %s  rtt_min: %s  self.min_rtt: %s  delta_rtt: %s"
              % (reward, thr_n_min, rtt_min, self.min_rtt, rtt_min - self.min_rtt))
        # print("unAck:" + str(unAck_n_min))
        if self.max_bw != 0:
            state[0] = thr_n_min / self.max_bw
            # tmp = pacing_rate_n_min / self.max_bw
            state = np.append(state, [5 * loss_rate_n_min])
            state = np.append(state, [unAck_n_min])
        else:
            state[0] = 0
            state = np.append(state, [0])
            state = np.append(state, [0])
        state = np.append(state, [1400 / cwnd])
        state = np.append(state, [self.min_rtt / rtt_min])

        state_after = np.delete(state_after, [0], axis=0)
        state_after = np.append(state_after, state)

        return state_after, reward, thr_n_min, rtt_min
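
# extract_observation above folds the reward calculation into a longer method.
# Isolated, the formula it computes is
# reward = throughput - 5 * (rtt - min_rtt) - 10 * loss_rate.
# The helper below restates just that calculation for readability; it is a
# sketch, not code from the example.
def congestion_reward(throughput, rtt, min_rtt, loss_rate):
    """Reward higher throughput; penalize queueing delay (rtt above the
    minimum observed rtt) and packet loss."""
    return throughput - 5.0 * (rtt - min_rtt) - 10.0 * loss_rate

# e.g. congestion_reward(10.0, 0.03, 0.02, 0.01) == 10.0 - 0.05 - 0.1 == 9.85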