class NeuralQLearner(object):
    def __init__(
            self,
            session,
            optimizer,
            q_network,
            restore_net_path,
            state_dim,
            num_actions,
            batch_size,
            init_exp,  # initial exploration prob
            final_exp,  # final exploration prob
            anneal_steps,  # N steps for annealing exploration
            replay_buffer_size,
            store_replay_every,  # how frequent to store experience
            discount_factor,  # discount future rewards
            target_update_rate,
            reg_param,  # regularization constants
            max_gradient,  # max gradient norms
            double_q_learning,
            summary_writer,
            summary_every):

        # tensorflow machinery
        self.session = session
        self.optimizer = optimizer
        self.summary_writer = summary_writer

        # model components
        self.q_network = q_network
        self.restore_net_path = restore_net_path
        self.replay_buffer = ReplayBuffer(buffer_size=replay_buffer_size)

        # Q learning parameters
        self.batch_size = batch_size
        self.state_dim = state_dim
        self.num_actions = num_actions
        self.exploration = init_exp
        self.init_exp = init_exp
        self.final_exp = final_exp
        self.anneal_steps = anneal_steps
        self.discount_factor = discount_factor
        self.target_update_rate = target_update_rate
        self.double_q_learning = double_q_learning

        # training parameters
        self.max_gradient = max_gradient
        self.reg_param = reg_param

        # counters
        self.store_replay_every = store_replay_every
        self.store_experience_cnt = 0
        self.train_iteration = 0

        # create and initialize variables
        self.create_variables()

        if self.restore_net_path is not None:
            saver = tf.train.Saver()
            saver.restore(self.session, self.restore_net_path)
        else:
            var_lists = tf.get_collection(tf.GraphKeys.VARIABLES)
            self.session.run(tf.initialize_variables(var_lists))

        #var_lists = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
        #self.session.run(tf.variables_initializer(var_lists))

        # make sure all variables are initialized
        self.session.run(tf.assert_variables_initialized())

        self.summary_every = summary_every
        if self.summary_writer is not None:
            # graph was not available when journalist was created
            self.summary_writer.add_graph(self.session.graph)

    def create_variables(self):
        # compute action from a state: a* = argmax_a Q(s_t,a)
        with tf.name_scope("predict_actions"):
            # raw state representation
            self.states = tf.placeholder(tf.float32, (None, self.state_dim),
                                         name="states")
            # initialize Q network
            with tf.variable_scope("q_network"):
                self.q_outputs = self.q_network(self.states)
            # predict actions from Q network
            self.action_scores = tf.identity(self.q_outputs,
                                             name="action_scores")
            tf.summary.histogram("action_scores", self.action_scores)
            self.predicted_actions = tf.argmax(self.action_scores,
                                               dimension=1,
                                               name="predicted_actions")

        # estimate rewards using the next state: r(s_t,a_t) + argmax_a Q(s_{t+1}, a)
        with tf.name_scope("estimate_future_rewards"):
            self.next_states = tf.placeholder(tf.float32,
                                              (None, self.state_dim),
                                              name="next_states")
            self.next_state_mask = tf.placeholder(tf.float32, (None, ),
                                                  name="next_state_masks")

            if self.double_q_learning:
                # reuse Q network for action selection
                with tf.variable_scope("q_network", reuse=True):
                    self.q_next_outputs = self.q_network(self.next_states)
                self.action_selection = tf.argmax(tf.stop_gradient(
                    self.q_next_outputs),
                                                  1,
                                                  name="action_selection")
                tf.summary.histogram("action_selection", self.action_selection)
                self.action_selection_mask = tf.one_hot(
                    self.action_selection, self.num_actions, 1, 0)
                # use target network for action evaluation
                with tf.variable_scope("target_network"):
                    self.target_outputs = self.q_network(
                        self.next_states) * tf.cast(self.action_selection_mask,
                                                    tf.float32)
                self.action_evaluation = tf.reduce_sum(self.target_outputs,
                                                       axis=[1])
                tf.summary.histogram("action_evaluation",
                                     self.action_evaluation)
                self.target_values = self.action_evaluation * self.next_state_mask
            else:
                # initialize target network
                with tf.variable_scope("target_network"):
                    self.target_outputs = self.q_network(self.next_states)
                # compute future rewards
                self.next_action_scores = tf.stop_gradient(self.target_outputs)
                #self.target_values = tf.reduce_max(self.next_action_scores, axis=[1, ]) * self.next_state_mask
                self.target_values = tf.reduce_max(
                    self.next_action_scores,
                    reduction_indices=[1]) * self.next_state_mask
                tf.summary.histogram("next_action_scores",
                                     self.next_action_scores)

            self.rewards = tf.placeholder(tf.float32, (None, ), name="rewards")
            self.future_rewards = self.rewards + self.discount_factor * self.target_values

        # compute loss and gradients
        with tf.name_scope("compute_temporal_differences"):
            # compute temporal difference loss
            self.action_mask = tf.placeholder(tf.float32,
                                              (None, self.num_actions),
                                              name="action_mask")
            #self.masked_action_scores = tf.reduce_sum(self.action_scores * self.action_mask, axis=[1, ])
            self.masked_action_scores = tf.reduce_sum(
                self.action_scores * self.action_mask, reduction_indices=[1])
            self.temp_diff = self.masked_action_scores - self.future_rewards
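            # Instead of the plain squared TD error (see the commented-out
            # variants below), squash both the predicted and the target values
            # through a sigmoid before squaring, then rescale the mean; this
            # keeps the per-sample error term bounded.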
            self.norm_diff = tf.square(
                tf.sigmoid(self.masked_action_scores / 100.0) -
                tf.sigmoid(self.future_rewards / 100.0))
            #self.norm_diff = tf.nn.sigmoid(tf.square(self.temp_diff)/40000.0)
            self.td_loss = tf.reduce_mean(self.norm_diff) * 20000.0
            # regularization loss
            q_network_variables = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="q_network")
            self.reg_loss = self.reg_param * tf.reduce_sum(
                [tf.reduce_sum(tf.square(x)) for x in q_network_variables])
            # compute total loss and gradients
            self.loss = self.td_loss + self.reg_loss
            gradients = self.optimizer.compute_gradients(self.loss)
            # clip gradients by norm
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (tf.clip_by_norm(grad,
                                                    self.max_gradient), var)
            # add histograms for gradients.
            for grad, var in gradients:
                tf.summary.histogram(var.name, var)
                if grad is not None:
                    tf.summary.histogram(var.name + '/gradients', grad)
            self.train_op = self.optimizer.apply_gradients(gradients)

        # update target network with Q network
        with tf.name_scope("update_target_network"):
            self.target_network_update = []
            # slowly update target network parameters with Q network parameters
            q_network_variables = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="q_network")
            target_network_variables = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="target_network")
            for v_source, v_target in zip(q_network_variables,
                                          target_network_variables):
                # this is equivalent to target = (1-alpha) * target + alpha * source
                update_op = v_target.assign_sub(self.target_update_rate *
                                                (v_target - v_source))
                self.target_network_update.append(update_op)
            self.target_network_update = tf.group(*self.target_network_update)

        # scalar summaries
        tf.summary.scalar("td_loss", self.td_loss)
        #tf.summary.scalar("reg_loss", self.reg_loss)
        tf.summary.scalar("total_loss", self.loss)
        tf.summary.scalar("exploration", self.exploration)

        self.summarize = tf.summary.merge_all()
        self.no_op = tf.no_op()

    def storeExperience(self, state, action, reward, next_state, done):
        # always store end states
        if self.store_experience_cnt % self.store_replay_every == 0 or done:
            self.replay_buffer.add(state, action, reward, next_state, done)
        self.store_experience_cnt += 1

    def eGreedyAction(self, states, explore=True):
        if explore and self.exploration > random.random():
            return random.randint(0, self.num_actions - 1)
        else:
            return self.session.run(self.predicted_actions,
                                    {self.states: states})[0]

    def annealExploration(self, strategy='linear'):
        ratio = max((self.anneal_steps - self.train_iteration) /
                    float(self.anneal_steps), 0)
        self.exploration = (self.init_exp -
                            self.final_exp) * ratio + self.final_exp

    def updateModel(self, episode=-1):
        # not enough experiences yet
        print("compare  ", self.replay_buffer.count(), self.batch_size)
        if self.replay_buffer.count() < self.batch_size:
            return

        batch = self.replay_buffer.getBatch(self.batch_size)
        states = np.zeros((self.batch_size, self.state_dim))
        rewards = np.zeros((self.batch_size, ))
        action_mask = np.zeros((self.batch_size, self.num_actions))
        next_states = np.zeros((self.batch_size, self.state_dim))
        next_state_mask = np.zeros((self.batch_size, ))

        for k, (s0, a, r, s1, done) in enumerate(batch):
            states[k] = s0
            rewards[k] = r
            action_mask[k][a] = 1
            # check terminal state
            if not done:
                next_states[k] = s1
                next_state_mask[k] = 1

        # whether to calculate summaries
        calculate_summaries = self.train_iteration % self.summary_every == 0 and self.summary_writer is not None

        # perform one update of training
        #direct_r, nxt_r, label_r, now_net_r, diff, norm_diff, cost, td_cost, reg_cost, _, summary_str = self.session.run([
        cost, td_cost, reg_cost, _, summary_str = self.session.run(
            [
                #self.rewards,
                #self.target_values * self.discount_factor,
                #self.future_rewards,
                #self.masked_action_scores,
                #self.temp_diff,
                #self.norm_diff,
                self.loss,
                self.td_loss,
                self.reg_loss,
                self.train_op,
                self.summarize if calculate_summaries else self.no_op
            ],
            {
                self.states: states,
                self.next_states: next_states,
                self.next_state_mask: next_state_mask,
                self.action_mask: action_mask,
                self.rewards: rewards
            })
        '''
        rewards_out = open(rewards_out_path, 'a+')
        if self.train_iteration % 100 == 0:
            for i in range(len(direct_r)):
                print("episode: ", episode, "iter: ", self.train_iteration, "mini batch ---  ", i, "direct_r ",
                      direct_r[i],
                      "nxt_r: ", nxt_r[i], "label_r: ", label_r[i], "now_net_r: ", now_net_r[i],
                      "tmpdiff: ", diff[i],
                      "norm_diff", norm_diff[i],
                      #"loss", cost[i],
                       #"state: ", states[i],
                        file=rewards_out)
            sys.stdout.flush()
        rewards_out.close()
        '''
        #if self.train_iteration % 500:
        #   print('0000 :  ', diff, file=logf)
        #  print('llll :  ', norm_diff, file=logf)
        loss_out = open(loss_out_path, "a+")
        print("episode: ",
              episode,
              "iter: ",
              self.train_iteration,
              "hjk loss is -----  ",
              cost,
              "hjk td_loss is -----  ",
              td_cost,
              "hjk reg_loss is -----  ",
              reg_cost,
              file=loss_out)
        sys.stdout.flush()
        loss_out.close()
        # update target network using Q-network
        self.session.run(self.target_network_update)
        '''
        # emit summaries
        if calculate_summaries:
            self.summary_writer.add_summary(summary_str, self.train_iteration)
        '''
        self.annealExploration()
        self.train_iteration += 1

        del batch, states, rewards, action_mask, next_states, next_state_mask
        #del direct_r, nxt_r, label_r, now_net_r, diff, norm_diff
        gc.collect()
        #objgraph.show_most_common_types(limit=50)

    def save_net(self, path):
        saver = tf.train.Saver()
        save_path = saver.save(self.session, path)
        print("Save to path: " + save_path)
Example #2
class DDPG(nn.Module):
    def __init__(
        self,
        state_dim,
        action_dim,
        learning_rate_a=1e-3,
        learning_rate_c=1e-3,
        gamma=0.99,
        update_tau=1e-3,
        batch_size=100,
        buffer_size=10000,
        training_start=1000,
    ):
        super(DDPG, self).__init__()
        self.s_dim = state_dim
        self.a_dim = action_dim
        self.lr_a = learning_rate_a
        self.lr_c = learning_rate_c
        self.gamma = gamma
        self.update_tau = update_tau
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.training_start = training_start
        self.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')

        self.actor = Actor(input_dim=self.s_dim,
                           output_dim=self.a_dim,
                           update_tau=self.update_tau).to(self.device)
        self.critic = Critic(state_dim=self.s_dim,
                             action_dim=self.a_dim,
                             update_tau=self.update_tau).to(self.device)
        self.buffer = ReplayBuffer(buffer_size=self.buffer_size)

        self.loss_actor = 0
        self.loss_critic = 0
        self.optimizer_a = optim.Adam(self.actor.eval_net.parameters(),
                                      lr=self.lr_a)
        self.optimizer_c = optim.Adam(self.critic.parameters(), lr=self.lr_c)

    def choose_action(self, s):
        s = torch.Tensor(s).to(self.device)
        return self.actor.get_eval(s).to(
            torch.device('cpu')).detach().numpy().tolist()

    def percive(self, state, action, reward, state_, done):
        self.buffer.add(state, action, reward, state_, done)
        if self.training_start < self.buffer.count():
            self.Train()

    def get_critic_loss(self, reward, state_next, state, action, done):
        action_next = self.actor.get_target(state_next)
        q_next_tar = self.critic.get_target(s=state_next, a=action_next)
        Q_target = reward + self.gamma * q_next_tar * (1 - done)
        Q_eval = self.critic.get_eval(s=state, a=action)
        return F.mse_loss(Q_target, Q_eval)

    def Train(self):
        minibatch = self.buffer.get_batch(batch_size=self.batch_size)
        state_batch = torch.Tensor([data[0]
                                    for data in minibatch]).to(self.device)
        action_batch = torch.Tensor([data[1]
                                     for data in minibatch]).to(self.device)
        reward_batch = torch.Tensor([data[2]
                                     for data in minibatch]).to(self.device)
        state_next_batch = torch.Tensor([data[3] for data in minibatch
                                         ]).to(self.device)
        done_batch = torch.Tensor([data[4]
                                   for data in minibatch]).to(self.device)

        #train critic
        self.loss_critic = self.get_critic_loss(reward_batch, state_next_batch,
                                                state_batch, action_batch,
                                                done_batch)
        self.optimizer_c.zero_grad()
        self.loss_critic.backward()
        self.optimizer_c.step()

        #train actor
        self.loss_actor = -self.critic.get_eval(state_batch,
                                                action_batch).mean()
        self.optimizer_a.zero_grad()
        self.loss_actor.backward()
        self.optimizer_a.step()

        #update the target net
        self.actor.soft_update()
        self.critic.soft_update()
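

# A minimal sketch of the soft (Polyak) target update that Actor.soft_update()
# and Critic.soft_update() above are assumed to perform with update_tau; the
# Actor and Critic classes themselves are not shown here, so this helper is
# illustrative rather than their actual implementation.
def polyak_update(target_net, eval_net, tau):
    # target <- (1 - tau) * target + tau * eval, parameter by parameter
    for t_param, e_param in zip(target_net.parameters(), eval_net.parameters()):
        t_param.data.mul_(1.0 - tau)
        t_param.data.add_(tau * e_param.data)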
class DDPG:
    """docstring for DDPG"""
    def __init__(self, state_space, action_dim):
        self.name = 'DDPG'  # name for uploading results
        self.sess = tf.Session()

        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_space = state_space
        self.action_dim = action_dim  # 1

        self.ac_network = ActorCriticNetwork(self.sess, self.state_space,
                                             self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

    def train(self):
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Get Q target label
        # maxQ(s',a')
        q_value_batch = self.ac_network.target_q(next_state_batch)

        # Calculate target maxQ(s,a): y = reward + GAMMA * maxQ(s',a')
        y_batch = []
        batch_size = len(minibatch)
        for i in range(batch_size):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [batch_size, 1])

        # Update eval critic network by minimizing the loss L
        cost = self.ac_network.train_critic(y_batch, state_batch, action_batch)
        print('step_%d critic cost:' % self.ac_network.time_step, cost)

        # Update eval actor policy using the sampled gradient:
        self.ac_network.train_actor(state_batch)

        # Update the target networks
        self.ac_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        action = self.ac_network.actions(state)
        return action[0] + self.exploration_noise.noise()

    def action(self, state):
        action = self.ac_network.actions([state])
        return action[0]

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Store transitions to replay start size then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        #if self.time_step % 10000 == 0:
        #self.actor_network.save_network(self.time_step)
        #self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

    def sparse_tensor(self, state_batch, state_space):
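        # state_batch is assumed to hold, for each row, the indices of that
        # row's active (non-zero) features; the loop below turns them into a
        # multi-hot SparseTensorValue of dense shape [row, state_space].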
        row = len(state_batch)
        indices = []
        for r in range(row):
            indices += [(r, c) for c in state_batch[r]]
        values = [1.0 for i in range(len(indices))]
        return tf.SparseTensorValue(indices=indices,
                                    values=values,
                                    dense_shape=[row, state_space])
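

# OUNoise is used above but not defined in this snippet; a minimal sketch of a
# standard Ornstein-Uhlenbeck exploration-noise process. The mu/theta/sigma
# defaults are common illustrative values, not taken from the original code.
import numpy as np


class OUNoiseSketch(object):
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # one Euler step of dx = theta * (mu - x) + sigma * N(0, 1)
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return self.state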
Example #4
class RDPG:
    """docstring for RDPG"""
    def __init__(self, env):
        self.name = 'RDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        self.saver = tf.train.Saver()

    def train(self):
        # Sample a random minibatch of N sequences from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        # Construct histories
        observations = []
        next_observations = []
        actions = []
        rewards = []
        dones = []
        for each in minibatch:
            for i in range(1, len(each.observations)):
                observations.append(self.pad(each.observations[0:i]))
                next_observations.append(self.pad(each.observations[1:i + 1]))
                actions.append(each.actions[0:i - 1])
                rewards.append(each.rewards[0:i])
                if i == len(each.observations) - 1:
                    dones.append(True)
                else:
                    dones.append(False)
        # Calculate y_batch
        next_action_batch = self.actor_network.target_action(observations)
        q_value_batch = self.critic_network.target_q(
            next_observations,
            [self.pad(i + j) for (i, j) in zip(actions, next_action_batch)])
        y_batch = []
        for i in range(len(observations)):
            if dones[i]:
                y_batch.append(rewards[i][-1])
            else:
                y_batch.append(rewards[i][-1] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [len(observations), 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, observations,
                                  [self.pad(i) for i in actions])

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(observations)
        q_gradient_batch = self.critic_network.gradients(
            observations, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, observations)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def save_model(self, path, episode):
        self.saver.save(self.sess, path + "modle.ckpt", episode)

    def noise_action(self, history):
        # Select action a_t according to a sequence of observation and action
        action = self.actor_network.action(history)
        return action + self.exploration_noise.noise()

    def action(self, history):
        action = self.actor_network.action(history)
        return action

    def perceive(self, history, done):
        # Store the history sequence in the replay buffer
        self.replay_buffer.add(history)

        # Store history to replay start size then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

    def pad(self, input):
        dim = len(input[0])
        return input + [[0] * dim] * (1000 - len(input))
def main(args):
    if VERBOSE:
        print '***The Replay Buffer currently always returns the most recent experiences (instead of random), so the batches are constant between the tf and torch nets.'

    state_dim = 3
    action_dim = 1

    net = ActorCriticNet(state_dim, action_dim)

    target_net = copy.deepcopy(net)
    memory = ReplayBuffer(REPLAY_BUFFER_SIZE)
    noise = OUNoise(action_dim)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE, weight_decay=L2)
    target_optim = optim.Optimizer(target_net.parameters(),
                                   {})  # to iterate over target params

    if VERBOSE: print '***Making gym env (only used to setup TF net).'

    # load tf net (restoring saved parameters)
    dtf = ddpg_tf.DDPG_TF(filter_env.makeFilteredEnv(gym.make('Pendulum-v0')),
                          loadfilename='tf_params-0',
                          printVars=False)

    if VERBOSE: print '***TF net restore complete.'

    # load control data (only using a every fourth data), and tf net results
    control_states = np.load('control_states.npy')[::4]
    control_rewards = np.load('control_rewards.npy')[::4]
    tf_record = np.load('tf_control_record.npy')

    # replace torch params with tf params, and run control data, collecting torch net results
    # first optimization step will occur at i == 50, upon which extra data is recorded to compare tf and torch
    # using: no bn, REPLAY_BUFFER_SIZE=200, REPLAY_START_SIZE=50, BATCH_SIZE=50, constant replay_buffer_batches (always the most recent experiences)
    replaceNetParams(dtf, net, target_net)

    if VERBOSE: print '***Torch net params initialized to TF net params.'

    original_net = copy.deepcopy(net)  # save original net
    original_target_net = copy.deepcopy(target_net)

    torch_record = []

    loss = -1
    first_step = True

    for i in xrange(len(control_rewards) - 1):
        state = torch.from_numpy(control_states[i].reshape(1,
                                                           state_dim)).float()
        action = net.getAction(Variable(state)).data
        target_action = target_net.getAction(Variable(state)).data

        reward = torch.FloatTensor([[control_rewards[i]]]).float()

        new_state = torch.from_numpy(control_states[i + 1].reshape(
            1, state_dim)).float()

        memory.add(state, action, reward, new_state, True)
        if memory.count() > REPLAY_START_SIZE:
            minibatch = memory.get_batch(BATCH_SIZE)
            state_batch = torch.cat([data[0] for data in minibatch], dim=0)
            action_batch = torch.cat([data[1] for data in minibatch], dim=0)
            reward_batch = torch.cat([data[2] for data in minibatch])
            next_state_batch = torch.cat([data[3] for data in minibatch],
                                         dim=0)
            done_batch = Tensor([data[4] for data in minibatch])

            # calculate y_batch from targets
            #next_action_batch = target_net.getAction(Variable(next_state_batch))
            value_batch = target_net.getValue(Variable(next_state_batch)).data
            y_batch = reward_batch + GAMMA * value_batch * done_batch

            if first_step:
                if VERBOSE: print '***First Optimization Step complete.'
                torch_ys = y_batch
                torch_batch = minibatch
                torch_outs = net.getValue(Variable(state_batch)).data

            # optimize net 1 step
            loss = criterion(net.getValue(Variable(state_batch)),
                             Variable(y_batch))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss = loss.data[0]

            # update targets - using exponential moving averages
            for group, target_group in zip(optimizer.param_groups,
                                           target_optim.param_groups):
                for param, target_param in zip(group['params'],
                                               target_group['params']):
                    target_param.data.mul_(1 - TAU)
                    target_param.data.add_(TAU, param.data)

            if first_step:
                first_step_net = copy.deepcopy(net)
                first_step_target_net = copy.deepcopy(target_net)
                first_step = False

        torch_record.append(
            [action.numpy()[0][0],
             target_action.numpy()[0][0], loss])
        loss = -1

    torch_record = np.array(torch_record)
    torch_outs = torch_outs.numpy().T[0]
    torch_ys = torch_ys.numpy().T[0]

    if VERBOSE: print '***Control Data run complete.'

    # compare torch and tf results
    # results for each net have 3 columns: [net action prediction, target net action prediction, loss (-1 if there was no training)]
    sel = np.arange(45, 55)
    #print calc_error(tf_record[sel,:], torch_record[sel,:])
    print 'Result comparison:'
    print 'control_data_index | tf_net_action | tf_target_net_action | tf_loss | torch_net_action | torch_target_net_action | torch_loss'
    print np.hstack(
        [sel[:, np.newaxis], tf_record[sel, :], torch_record[sel, :]])
    print '\t(a loss of -1 means no training occured in that step)'

    # load all tf results from before taking first optimization step
    tf_ys = np.load('tf_first_step_y_batch.npy')
    tf_rs = np.load('tf_first_step_reward_batch.npy')
    tf_ds = np.load('tf_first_step_done_batch.npy')
    tf_vs = np.load('tf_first_step_value_batch.npy')
    tf_outs = np.load('tf_first_step_output_values.npy')
    torch_wd = 1.36607  # weight decay loss of tf net at first optimization step - recorded directly from terminal output of tf net

    if VERBOSE:
        print '***Comparing first step stats'

        # compare tf and torch data from before taking first optimization step
        # including calculation of manual loss
        print '\terror in ys (between tf and torch)', calc_error(
            torch_ys, tf_ys)
        print '\terror in predictions (between tf and torch)', calc_error(
            torch_outs, tf_outs)
        print '\ttorch loss (manually calculated)', np.mean(
            (torch_ys - torch_outs)**2)
        print '\ttf loss (manually calculated)', np.mean((tf_ys - tf_outs)**2)
        print '\ttorch loss', torch_record[50,
                                           2], '(not including weight decay)'
        print '\ttf loss', tf_record[
            50, 2] - torch_wd, '(not including weight decay)'

    return 0
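

# calc_error is used above but not defined in this snippet; a plausible sketch
# (an assumption about its behavior) that reports the relative L2 error
# between two arrays. Assumes numpy is imported as np, as elsewhere above.
def calc_error_sketch(a, b):
    a = np.asarray(a, dtype=np.float64)
    b = np.asarray(b, dtype=np.float64)
    return np.linalg.norm(a - b) / (np.linalg.norm(b) + 1e-12)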
class NeuralQLearner(object):

  def __init__(self, session,
                     optimizer,
                     q_network,
                     state_dim,
                     num_actions,
                     batch_size=32,
                     init_exp=0.5,       # initial exploration prob
                     final_exp=0.1,      # final exploration prob
                     anneal_steps=10000, # N steps for annealing exploration 
                     replay_buffer_size=10000,
                     store_replay_every=5, # how frequent to store experience
                     discount_factor=0.9, # discount future rewards
                     target_update_rate=0.01,
                     reg_param=0.01, # regularization constants
                     max_gradient=5, # max gradient norms
                     double_q_learning=False,
                     summary_writer=None,
                     summary_every=100):

    # tensorflow machinery
    self.session        = session
    self.optimizer      = optimizer
    self.summary_writer = summary_writer

    # model components
    self.q_network     = q_network
    self.replay_buffer = ReplayBuffer(buffer_size=replay_buffer_size)

    # Q learning parameters
    self.batch_size      = batch_size
    self.state_dim       = state_dim
    self.num_actions     = num_actions
    self.exploration     = init_exp
    self.init_exp        = init_exp
    self.final_exp       = final_exp
    self.anneal_steps    = anneal_steps
    self.discount_factor = discount_factor
    self.target_update_rate = target_update_rate
    self.double_q_learning = double_q_learning

    # training parameters
    self.max_gradient = max_gradient
    self.reg_param    = reg_param

    # counters
    self.store_replay_every   = store_replay_every
    self.store_experience_cnt = 0
    self.train_iteration      = 0

    # create and initialize variables
    self.create_variables()
    var_lists = tf.get_collection(tf.GraphKeys.VARIABLES)
    self.session.run(tf.initialize_variables(var_lists))

    # make sure all variables are initialized
    self.session.run(tf.assert_variables_initialized())

    self.summary_every = summary_every
    if self.summary_writer is not None:
      # graph was not available when journalist was created
      self.summary_writer.add_graph(self.session.graph)

  def create_variables(self):
    # compute action from a state: a* = argmax_a Q(s_t,a)
    with tf.name_scope("predict_actions"):
      # raw state representation
      self.states = tf.placeholder(tf.float32, (None, self.state_dim), name="states")
      # initialize Q network
      with tf.variable_scope("q_network"):
        self.q_outputs = self.q_network(self.states)
      # predict actions from Q network
      self.action_scores = tf.identity(self.q_outputs, name="action_scores")
      tf.histogram_summary("action_scores", self.action_scores)
      self.predicted_actions = tf.argmax(self.action_scores, dimension=1, name="predicted_actions")

    # estimate rewards using the next state: r(s_t,a_t) + argmax_a Q(s_{t+1}, a)
    with tf.name_scope("estimate_future_rewards"):
      self.next_states = tf.placeholder(tf.float32, (None, self.state_dim), name="next_states")
      self.next_state_mask = tf.placeholder(tf.float32, (None,), name="next_state_masks")

      if self.double_q_learning:
        # reuse Q network for action selection
        with tf.variable_scope("q_network", reuse=True):
          self.q_next_outputs = self.q_network(self.next_states)
        self.action_selection = tf.argmax(tf.stop_gradient(self.q_next_outputs), 1, name="action_selection")
        tf.histogram_summary("action_selection", self.action_selection)
        self.action_selection_mask = tf.one_hot(self.action_selection, self.num_actions, 1, 0)
        # use target network for action evaluation
        with tf.variable_scope("target_network"):
          self.target_outputs = self.q_network(self.next_states) * tf.cast(self.action_selection_mask, tf.float32)
        self.action_evaluation = tf.reduce_sum(self.target_outputs, reduction_indices=[1,])
        tf.histogram_summary("action_evaluation", self.action_evaluation)
        self.target_values = self.action_evaluation * self.next_state_mask
      else:
        # initialize target network
        with tf.variable_scope("target_network"):
          self.target_outputs = self.q_network(self.next_states)
        # compute future rewards
        self.next_action_scores = tf.stop_gradient(self.target_outputs)
        self.target_values = tf.reduce_max(self.next_action_scores, reduction_indices=[1,]) * self.next_state_mask
        tf.histogram_summary("next_action_scores", self.next_action_scores)

      self.rewards = tf.placeholder(tf.float32, (None,), name="rewards")
      self.future_rewards = self.rewards + self.discount_factor * self.target_values

    # compute loss and gradients
    with tf.name_scope("compute_temporal_differences"):
      # compute temporal difference loss
      self.action_mask = tf.placeholder(tf.float32, (None, self.num_actions), name="action_mask")
      self.masked_action_scores = tf.reduce_sum(self.action_scores * self.action_mask, reduction_indices=[1,])
      self.temp_diff = self.masked_action_scores - self.future_rewards
      self.td_loss = tf.reduce_mean(tf.square(self.temp_diff))
      # regularization loss
      q_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="q_network")
      self.reg_loss = self.reg_param * tf.reduce_sum([tf.reduce_sum(tf.square(x)) for x in q_network_variables])
      # compute total loss and gradients
      self.loss = self.td_loss + self.reg_loss
      gradients = self.optimizer.compute_gradients(self.loss)
      # clip gradients by norm
      for i, (grad, var) in enumerate(gradients):
        if grad is not None:
          gradients[i] = (tf.clip_by_norm(grad, self.max_gradient), var)
      # add histograms for gradients.
      for grad, var in gradients:
        tf.histogram_summary(var.name, var)
        if grad is not None:
          tf.histogram_summary(var.name + '/gradients', grad)
      self.train_op = self.optimizer.apply_gradients(gradients)

    # update target network with Q network
    with tf.name_scope("update_target_network"):
      self.target_network_update = []
      # slowly update target network parameters with Q network parameters
      q_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="q_network")
      target_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="target_network")
      for v_source, v_target in zip(q_network_variables, target_network_variables):
        # this is equivalent to target = (1-alpha) * target + alpha * source
        update_op = v_target.assign_sub(self.target_update_rate * (v_target - v_source))
        self.target_network_update.append(update_op)
      self.target_network_update = tf.group(*self.target_network_update)

    # scalar summaries
    tf.scalar_summary("td_loss", self.td_loss)
    tf.scalar_summary("reg_loss", self.reg_loss)
    tf.scalar_summary("total_loss", self.loss)
    tf.scalar_summary("exploration", self.exploration)

    self.summarize = tf.merge_all_summaries()
    self.no_op = tf.no_op()

  def storeExperience(self, state, action, reward, next_state, done):
    # always store end states
    if self.store_experience_cnt % self.store_replay_every == 0 or done:
      self.replay_buffer.add(state, action, reward, next_state, done)
    self.store_experience_cnt += 1

  def eGreedyAction(self, states, explore=True):
    if explore and self.exploration > random.random():
      return random.randint(0, self.num_actions-1)
    else:
      return self.session.run(self.predicted_actions, {self.states: states})[0]

  def annealExploration(self, strategy='linear'):
    ratio = max((self.anneal_steps - self.train_iteration)/float(self.anneal_steps), 0)
    self.exploration = (self.init_exp - self.final_exp) * ratio + self.final_exp

  def updateModel(self):
    # not enough experiences yet
    if self.replay_buffer.count() < self.batch_size:
      return

    batch           = self.replay_buffer.getBatch(self.batch_size)
    states          = np.zeros((self.batch_size, self.state_dim))
    rewards         = np.zeros((self.batch_size,))
    action_mask     = np.zeros((self.batch_size, self.num_actions))
    next_states     = np.zeros((self.batch_size, self.state_dim))
    next_state_mask = np.zeros((self.batch_size,))

    for k, (s0, a, r, s1, done) in enumerate(batch):
      states[k] = s0
      rewards[k] = r
      action_mask[k][a] = 1
      # check terminal state
      if not done:
        next_states[k] = s1
        next_state_mask[k] = 1

    # whether to calculate summaries
    calculate_summaries = self.train_iteration % self.summary_every == 0 and self.summary_writer is not None

    # perform one update of training
    cost, _, summary_str = self.session.run([
      self.loss,
      self.train_op,
      self.summarize if calculate_summaries else self.no_op
    ], {
      self.states:          states,
      self.next_states:     next_states,
      self.next_state_mask: next_state_mask,
      self.action_mask:     action_mask,
      self.rewards:         rewards
    })

    # update target network using Q-network
    self.session.run(self.target_network_update)

    # emit summaries
    if calculate_summaries:
      self.summary_writer.add_summary(summary_str, self.train_iteration)

    self.annealExploration()
    self.train_iteration += 1
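

# ReplayBuffer is used throughout these snippets but never defined; a minimal
# sketch consistent with the calls made above (add, count, getBatch). The
# deque storage and uniform sampling are assumptions, not the original code.
import random
from collections import deque


class ReplayBufferSketch(object):
    def __init__(self, buffer_size):
        self.buffer = deque(maxlen=buffer_size)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def count(self):
        return len(self.buffer)

    def getBatch(self, batch_size):
        # uniform sampling without replacement from the stored transitions
        return random.sample(self.buffer, batch_size)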
Example #7
class DDPG:
    """docstring for DDPG"""
    def __init__(self, state_dim, action_dim):
        """name for uploading resuults"""
        self.name = 'DDPG'
        self.time_step = 0
        # self.atten_rate = 1
        """Randomly initialize actor network and critic network"""
        """and both their target networks"""
        self.state_dim = state_dim
        self.action_dim = action_dim

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)
        """initialize replay buffer"""
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        """Initialize a random process the Ornstein-Uhlenbeck process for action exploration"""
        self.exploration_noise = OUNoise(self.action_dim)
        """Initialize a Treading"""
        self.threading = threading.Thread(target=self.train,
                                          name='LoopThread--DDPG')

    def train(self):
        # if self.time_step ==0:
        #     print("Begins Training!!!")
        #print("Training Begins")
        self.time_step += 1
        """Sample a random minibatch of N transitions from replay buffer"""
        """take out BATCH_SIZE sets of data"""
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])
        """resize the action_batch shape to  [BATCH_SIZE, self.action_dim]"""
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])
        """Calculate y_batch(reward)"""
        next_action_batch = self.actor_network.target_action(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        """Update critic by minimizing the loss L (training)"""
        self.critic_network.train(y_batch, state_batch, action_batch)
        """Update the actor policy using the sampled gradient:"""
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)
        """Update the target networks"""
        self.actor_network.update_target()
        self.critic_network.update_target()
        #print("Training Finished")

    def noise_action(self, state):
        """Select action a_t according to the current policy and exploration noise"""
        action = self.actor_network.action(state)
        exp_noise = self.exploration_noise.noise()
        action += exp_noise
        # action[0] = np.clip(action[0], 0, 1)
        # action[1] = np.clip(action[1], -1, 1)
        return action

    def action(self, state):
        action = self.actor_network.action(state)
        # action[0] = np.clip(action[0], 0, 1)
        # action[1] = np.clip(action[1], -1, 1)
        return action

    def perceive(self, state, action, reward, next_state, done):
        """Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer"""
        self.replay_buffer.add(state, action, reward, next_state, done)
        """Store transitions to replay start size then start training"""
        # if self.replay_buffer.count() % 1000 == 0:
        #     print("The buffer count is ", self.replay_buffer.count())
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()
            # self.atten_rate *= 0.99995
            if not self.threading.is_alive():
                self.threading = threading.Thread(target=self.train,
                                                  name='LoopThread--DDPG')
                self.threading.start()
            """SAVE NETWORK"""
            if self.time_step % 100 == 0:
                print("Training_time_step:", self.time_step)
            if self.time_step % 1000 == 0:
                print("!!!!!!!save model success!!!!!!!!")
                self.actor_network.save_network(self.time_step)
                self.critic_network.save_network(self.time_step)
        """Re-iniitialize the random process when an episode ends"""
        if done:
            self.exploration_noise.reset()
Example #8
class DDPG:
    """docstring for DDPG"""
    def __init__(self, sess, data_fname):
        self.name = 'DDPG'  # name for uploading results
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = Hp.state_dim
        self.action_dim = Hp.action_dim
        print(self.state_dim, self.action_dim)

        self.sess = sess

        self.state_input = [
            tf.placeholder(tf.float32, shape=(None, None, Hp.n_coord))
            for _ in xrange(Hp.categories)
        ]
        #tf.placeholder("float",[None,self.state_dim])
        self.target_state_input = [
            tf.placeholder(tf.float32, shape=(None, None, Hp.n_coord))
            for _ in xrange(Hp.categories)
        ]
        #tf.placeholder("float",[None,self.state_dim])
        self.state_network = StateEnc(self.sess, self.state_input,
                                      self.target_state_input)
        state_batch = self.state_network.encoding
        next_state_batch = self.state_network.target_encoding

        weights, biases, w_i2h0, w_h2h0, w_b0, w_i2h1, w_h2h1, w_b1, w_i2h2, w_h2h2, w_b2 = self.state_network.get_parameters(
        )

        state_network_params = weights + biases + [
            w_i2h0, w_h2h0, w_b0, w_i2h1, w_h2h1, w_b1, w_i2h2, w_h2h2, w_b2
        ]

        self.actor_network = ActorNetwork(self.sess, Hp.n_hidden,
                                          self.action_dim, self.state_input,
                                          state_batch, next_state_batch,
                                          state_network_params)
        self.critic_network = CriticNetwork(self.sess, Hp.n_hidden,
                                            self.action_dim, state_batch,
                                            next_state_batch)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(Hp.REPLAY_BUFFER_SIZE, data_fname)
        self.summary_str2 = None

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

    def train(self):
        #print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatches = self.replay_buffer.get_batch(Hp.batch_size * Hp.N_TRAIN)
        print("######### TRAINING   #############")
        for k in range(Hp.N_TRAIN):
            minibatch = minibatches[k * Hp.batch_size:(k + 1) * Hp.batch_size]
            state_batch_r = np.asarray([data[0] for data in minibatch])
            state_batch = []
            for j in range(Hp.categories):
                new_cat = np.stack(state_batch_r[:, j], axis=0)
                state_batch.append(new_cat)
            #state_batch = [np.expand_dims(state_batch, axis=1)]
            action_batch = np.asarray([data[1] for data in minibatch])
            reward_batch = np.asarray([data[2] for data in minibatch])
            next_state_batch_r = np.asarray([data[3] for data in minibatch])
            next_state_batch = []
            for j in range(Hp.categories):
                new_cat = np.stack(next_state_batch_r[:, j], axis=0)
                next_state_batch.append(new_cat)
            #next_state_batch = [np.expand_dims(next_state_batch, axis=1)]
            done_batch = np.asarray([data[4] for data in minibatch])

            # for action_dim = 1
            action_batch = np.resize(action_batch,
                                     [Hp.batch_size, self.action_dim])

            next_action_batch = self.actor_network.target_actions(
                self.target_state_input, next_state_batch)
            q_value_batch = self.critic_network.target_q(
                self.target_state_input, next_state_batch, next_action_batch)
            y_batch = []

            for i in range(len(minibatch)):
                if done_batch[i]:
                    y_batch.append(reward_batch[i])
                else:
                    y_batch.append(reward_batch[i] +
                                   Hp.GAMMA * q_value_batch[i])

            y_batch = np.resize(y_batch, [Hp.batch_size, 1])

            # Update critic by minimizing the loss L
            self.critic_network.train(y_batch, self.state_input, state_batch,
                                      action_batch)

            # Update the actor policy using the sampled gradient:
            action_batch_for_gradients = self.actor_network.actions(
                self.state_input, state_batch)
            q_gradient_batch = self.critic_network.gradients(
                self.state_input, state_batch, action_batch_for_gradients)

            self.summary_str2 = self.actor_network.train(
                q_gradient_batch, self.state_input, state_batch)

            # Update the target networks
            self.actor_network.update_target()
            self.critic_network.update_target()
            self.state_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        state = [np.expand_dims(el, axis=0) for el in state]
        action = self.actor_network.action(state)
        print("no noise ", action)
        return np.clip(
            action +
            self.exploration_noise.noise() * np.array([-17.0, 17.0, 900.0]),
            [-35.0, 0.0, 0.0], [0.0, 35.0, 2000.0])

    def action(self, state):
        state = [np.expand_dims(el, axis=0) for el in state]
        action = self.actor_network.action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Store transitions to replay start size then start training
        if self.replay_buffer.count() > Hp.REPLAY_START_SIZE:
            self.train()

        #if self.time_step % 10000 == 0:
        #self.actor_network.save_network(self.time_step)
        #self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
Example #9
def main(config_dict):
    train = config_dict['train']
    network = config_dict['network']
    experiment_name = config_dict['experiment_name']
    EXPERIMENTS_PATH = config_dict['EXPERIMENTS_PATH']

    actor_weights_file = "%s%s/%s_actor.h5" % (EXPERIMENTS_PATH, network,
                                               network)
    critic_weights_file = "%s%s/%s_critic.h5" % (EXPERIMENTS_PATH, network,
                                                 network)

    log_directory = "%s%s/%s/" % (EXPERIMENTS_PATH, network, experiment_name)

    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001
    LRA = 0.0001
    LRC = 0.001

    action_dim = 3  # Steering / Acceleration / Brake
    state_dim = 29  # Dimension of sensor inputs

    #np.random.seed(42)

    vision = False
    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 100000
    done = False
    step = 0
    epsilon = 1

    exp_logger = TORCS_ExperimentLogger(log_directory, experiment_name)

    #directory = "%s%s/" % (EXPERIMENTS_PATH, experiment)
    #actor_weights_file = "%s%s_%s" % (directory, experiment, "actor.h5")
    #critic_weights_file = "%s%s_%s" % (directory, experiment, "critic.h5")

    # TensorFlow GPU
    config = tf.ConfigProto()
    # Not sure if this is really necessary, since we only have a single GPU
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    from keras import backend as K
    K.set_session(sess)

    actor = ActorFCNet(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticFCNet(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)

    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    # Weight loading
    if not train:
        try:
            actor.model.load_weights(actor_weights_file)
            critic.model.load_weights(critic_weights_file)
            actor.target_model.load_weights(actor_weights_file)
            critic.target_model.load_weights(critic_weights_file)
            print "Weights loaded successfully"
            time.sleep(2)
        except:
            print "Error in loading weights"
            print '-' * 60
            traceback.print_exc(file=sys.stdout)
            print '-' * 60
            assert (False)

    for i in xrange(episode_count):
        print "Episode: %i; Replay Buffer: %i" % (i, buff.count())

        if np.mod(i, 3) == 0:
            # Relaunch TORCS every 3 episodes; memory leak error
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        state_t = np.hstack(
            (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ,
             ob.wheelSpinVel / 100.0, ob.rpm))

        total_reward = 0.
        # Compute rewards
        for j in xrange(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE  # exploration factor
            action_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            action_t_raw = actor.model.predict(
                state_t.reshape(
                    1,
                    state_t.shape[0]))  # this call to reshape seems suboptimal

            noise_t[0][0] = train * max(epsilon, 0) * OU.run(
                action_t_raw[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train * max(epsilon, 0) * OU.run(
                action_t_raw[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train * max(epsilon, 0) * OU.run(
                action_t_raw[0][2], -0.1, 1.00, 0.05)

            # stochastic brake
            #if random.random() <= 0.1:
            #    noise_t[0][2] = train * max(epsilon, 0) * OU.run(action_t_raw[0][2], 0.2, 1.00, 0.10)

            # May be able to do this a bit more concisely with NumPy vectorization
            action_t[0][0] = action_t_raw[0][0] + noise_t[0][0]
            action_t[0][1] = action_t_raw[0][1] + noise_t[0][1]
            action_t[0][2] = action_t_raw[0][2] + noise_t[0][2]

            # Raw_reward_t is the raw reward computed by the gym_torcs script.
            # We will compute our own reward metric from the ob object
            ob, raw_reward_t, done, info = env.step(action_t[0])

            state_t1 = np.hstack(
                (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                 ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
            #reward_t = lng_trans(ob)
            reward_t = raw_reward_t

            buff.add(state_t, action_t[0], reward_t, state_t1,
                     done)  # Add replay buffer

            # Batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            done_indicators = np.asarray([e[4] for e in batch])
            y_t = np.zeros((len(batch), 1))  # TD targets, one per sampled transition

            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])

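            # DDPG critic target (sketch of the standard rule):
            #   y_k = r_k                                      if the transition was terminal
            #   y_k = r_k + GAMMA * Q'(s_{k+1}, mu'(s_{k+1}))  otherwise,
            # where Q' is the target critic and mu' the target actor evaluated above.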
            # Note: len(batch) may be smaller than BATCH_SIZE while the buffer is still filling up
            for k in xrange(len(batch)):
                if done_indicators[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if train:
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.train_target_net()
                critic.train_target_net()

            exp_logger.log(ob, action_t[0], reward_t, loss)

            total_reward += reward_t
            state_t = state_t1

            print("Episode", i, "Step", step, "Action", action_t, "Reward",
                  reward_t, "Loss", loss)

            step += 1
            if done:
                break

        if np.mod(i, 3) == 0:
            if train:
                print("Saving model weights")
                actor.model.save_weights(actor_weights_file, overwrite=True)
                #with open("actormodel.json", "w") as outfile: json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights(critic_weights_file, overwrite=True)
                #with open("criticmodel.json", "w") as outfile: json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
Example #10
class DDPG:
    def __init__(self, env, state_dim, action_dim):
        self.name = 'DDPG'
        self.environment = env
        self.time_step = 0
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (Ornstein-Uhlenbeck) for action exploration
        self.linear_noise = OUNoise(1, 0.5, 0.3, 0.6)
        self.angular_noise = OUNoise(1, 0, 0.6, 0.8)
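        # OUNoise is assumed to take (dim, mu, theta, sigma) and simulate
        # dx = theta * (mu - x) * dt + sigma * dW, giving temporally correlated
        # exploration noise for the linear and angular velocity commands.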

    def train(self):
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])
        # reshape flat action samples to (BATCH_SIZE, action_dim)
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
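        # Deterministic policy gradient (sketch):
        #   grad_theta J ~= mean_k [ grad_a Q(s_k, a)|_{a = mu(s_k)} * grad_theta mu(s_k) ],
        # so the critic supplies dQ/da and the actor applies it to its own parameters.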
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state, epsilon):
        action = self.actor_network.action(state)
        noise_t = np.zeros(self.action_dim)
        noise_t[0] = epsilon * self.linear_noise.noise()
        noise_t[1] = epsilon * self.angular_noise.noise()
        action = action + noise_t
        a_linear = np.clip(action[0], 0, 1)
        a_linear = round(a_linear, 1)
        a_angular = np.clip(action[1], -1, 1)
        a_angular = round(a_angular, 1)
        #print(a_linear, a_angular)

        return [a_linear, a_angular]

    def action(self, state):
        action = self.actor_network.action(state)
        a_linear = np.clip(action[0], 0, 1)
        a_linear = round(a_linear, 1)
        a_angular = np.clip(action[1], -1, 1)
        a_angular = round(a_angular, 1)

        return [a_linear, a_angular]

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)
        if self.replay_buffer.count() == REPLAY_START_SIZE:
            print('\n---------------Start training---------------')
        # Store transitions to replay start size then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.time_step += 1
            self.train()

        if self.time_step % 10000 == 0 and self.time_step > 0:
            self.actor_network.save_network(self.time_step)
            self.critic_network.save_network(self.time_step)

        if done:
            self.linear_noise.reset()
            self.angular_noise.reset()

        return self.time_step
Example #11
class DDQN:
    def __init__(self, model_name, action_dim):
        self.device = configure.DEVICE
        self.model_name = model_name
        self.action_dim = action_dim
        self.episode = 0
        # self.timeStep = 0
        self.STARTtrain = False
        self.epsilon = INITIAL_EPSILON

        self.img_width = configure.IMAGE_WIDTH
        self.img_height = configure.IMAGE_HEIGHT
        self.img_channels = configure.STACKED_FRAMES * 4

        self.learning_rate = configure.LEARNING_RATE_START
        self.tau = configure.TargetNet_Tau

        self.replaybuffer = ReplayBuffer(REPLAY_MEMORY)

        self.graph = tf.Graph()
        with self.graph.as_default() as g:
            with tf.device(self.device):
                with tf.variable_scope('Main_net'):
                    self.imageIn, self.conv1, self.conv2, self.conv3, self.pool1, self.conv4, \
                    self.Advantage, self.Value, self.Qout, self.predict \
                        = self.__create_graph()

                with tf.variable_scope('Target_net'):
                    self.imageInT, _, _, _, _, _, _, _, self.QoutT, _ = self.__create_graph(
                    )

                self.MainNet_vars = get_variables('Main_net')
                self.TargetNet_vars = get_variables('Target_net')
                self.createTrainingMethod()
                self.createupdateTargetNetOp()

                self.sess = tf.Session(
                    graph=self.graph,
                    config=tf.ConfigProto(
                        allow_soft_placement=True,
                        log_device_placement=False,
                        gpu_options=tf.GPUOptions(allow_growth=True)))
                self.sess.run(tf.global_variables_initializer())

                if configure.TENSORBOARD:
                    self._create_tensor_board()
                # if configure.LOAD_CHECKPOINT or configure.SAVE_MODELS:
                #     vars = tf.global_variables()
                #     self.saver = tf.train.Saver({var.name: var for var in vars}, max_to_keep=0)

                self.saver = tf.train.Saver()

                checkpoint = tf.train.get_checkpoint_state(self.model_name)
                if checkpoint and checkpoint.model_checkpoint_path:
                    self.saver.restore(self.sess,
                                       checkpoint.model_checkpoint_path)
                    print "Successfully loaded:", checkpoint.model_checkpoint_path
                    mypath = str(checkpoint.model_checkpoint_path)
                    stepmatch = re.split('-', mypath)[2]
                    self.episode = int(stepmatch)
                # pass
                else:
                    print "Could not find old network weights"

    # def __create_main_graph(self):
    #     self.imageIn = tf.placeholder(tf.float32, [None, self.img_height, self.img_width, self.img_channels], name='imgIn')
    #
    #     self.conv1 = self.conv2d_layer(self.imageIn, 8, 32, 'conv1', strides=[1, 4, 4, 1])
    #     self.conv2 = self.conv2d_layer(self.conv1, 4, 64, 'conv2', strides=[1, 2, 2, 1])
    #     self.conv3 = self.conv2d_layer(self.conv2, 3, 128, 'conv3', strides=[1, 1, 1, 1])
    #     self.conv4 = self.conv2d_layer(self.conv3, self.conv3.get_shape()[1].value, 512, 'conv4', strides=[1,1,1,1])
    #     with tf.variable_scope('A_V'):
    #         self.streamAC, self.streamVC = tf.split(self.conv4, 2, 3)
    #         self.streamA = tf.contrib.layers.flatten(self.streamAC)
    #         self.streamV = tf.contrib.layers.flatten(self.streamVC)
    #
    #         self.AW = tf.Variable(tf.random_normal([self.streamA, self.action_dim]), name='AW')
    #         self.VW = tf.Variable(tf.random_normal([self.streamV, 1]), name='VW')
    #         self.Advantage = tf.matmul(self.streamA, self.AW, name='Advantage')
    #         self.Value = tf.matmul(self.streamV, self.VW, name='Value')
    #
    #     with tf.variable_scope('Qout'):
    #         self.Qout = self.Value + tf.subtract(
    #             self.Advantage, tf.reduce_mean(self.Advantage, reduction_indices=1, keep_dims=True))
    #
    #     with tf.variable_scope('Predict'):
    #         self.predict = tf.argmax(self.Qout, 1)

    def __create_graph(self):
        imageIn = tf.placeholder(
            tf.float32,
            [None, self.img_height, self.img_width, self.img_channels],
            name='imgIn')

        conv1 = self.conv2d_layer(imageIn,
                                  8,
                                  128,
                                  'conv1',
                                  strides=[1, 4, 4, 1])
        conv2 = self.conv2d_layer(conv1, 4, 128, 'conv2', strides=[1, 2, 2, 1])
        conv3 = self.conv2d_layer(conv2, 3, 128, 'conv3', strides=[1, 1, 1, 1])
        pool1 = self.mpool_layer(conv3, 2, [1, 2, 2, 1], name='pool1')
        conv4 = self.conv2d_layer(pool1,
                                  pool1.get_shape()[1].value,
                                  1024,
                                  'conv4',
                                  strides=[1, 1, 1, 1],
                                  padding='VALID')

        streamAC, streamVC = tf.split(conv4, 2, 3)
        streamA = tf.contrib.layers.flatten(streamAC)
        streamV = tf.contrib.layers.flatten(streamVC)

        Advantage = self.fc_layer(streamA,
                                  self.action_dim,
                                  'Advantage',
                                  func=None)
        Value = self.fc_layer(streamV, 1, 'Value', func=None)

        # AW = tf.Variable(tf.random_normal([streamA.get_shape()[1].value, self.action_dim]), name='AW')
        # VW = tf.Variable(tf.random_normal([streamV.get_shape()[1].value, 1]), name='VW')
        # Advantage = tf.matmul(streamA, AW, name='Advantage')
        # Value = tf.matmul(streamV, VW, name='Value')
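        # Dueling aggregation: Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)).
        # Subtracting the mean advantage keeps V and A identifiable.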
        with tf.variable_scope('Qout'):
            Qout = Value + tf.subtract(
                Advantage,
                tf.reduce_mean(Advantage, reduction_indices=1, keep_dims=True))
        with tf.variable_scope('Predict'):
            predict = tf.argmax(Qout, 1)

        return imageIn, conv1, conv2, conv3, pool1, conv4, Advantage, Value, Qout, predict

    # def __create_target_graph(self):
    #     self.target_imageIn = tf.placeholder(tf.float32, [None, self.img_height, self.img_width, self.img_channels],
    #                                   name='imgIn')
    #     self.target_conv1 = self.conv2d_layer(self.target_imageIn, 8, 32, 'conv1', strides=[1, 4, 4, 1])
    #     self.target_conv2 = self.conv2d_layer(self.target_conv1, 4, 64, 'conv2', strides=[1, 2, 2, 1])
    #     self.target_conv3 = self.conv2d_layer(self.target_conv2, 3, 128, 'conv3', strides=[1, 1, 1, 1])
    #     self.target_conv4 = self.conv2d_layer(self.target_conv3, self.target_conv3.get_shape()[1].value, 512, 'conv4', strides=[1, 1, 1, 1])
    #     with tf.variable_scope('A_V'):
    #         self.target_streamAC, self.target_streamVC = tf.split(self.target_conv4, 2, 3)
    #         self.target_streamA = tf.contrib.layers.flatten(self.target_streamAC)
    #         self.target_streamV = tf.contrib.layers.flatten(self.target_streamVC)
    #
    #         self.target_AW = tf.Variable(tf.random_normal([self.target_streamA, self.action_dim]), name='AW')
    #         self.target_VW = tf.Variable(tf.random_normal([self.target_streamV, 1]), name='VW')
    #         self.target_Advantage = tf.matmul(self.target_streamA, self.target_AW, name='Advantage')
    #         self.target_Value = tf.matmul(self.target_streamV, self.target_VW, name='Value')
    #
    #     with tf.variable_scope('Qout'):
    #         self.Qout = self.target_Value + tf.subtract(
    #             self.target_Advantage, tf.reduce_mean(self.target_Advantage, reduction_indices=1, keep_dims=True))

    def createTrainingMethod(self):
        self.global_step = tf.Variable(0, trainable=False, name='step')
        self.var_learning_rate = tf.placeholder(tf.float32,
                                                name='lr',
                                                shape=[])
        self.targetQ = tf.placeholder(shape=[None],
                                      dtype=tf.float32,
                                      name='targetQ')
        self.actions = tf.placeholder(shape=[None],
                                      dtype=tf.int32,
                                      name='actions')
        self.actions_onehot = tf.one_hot(self.actions,
                                         self.action_dim,
                                         dtype=tf.float32,
                                         name='act_onehot')
        self.Q = tf.reduce_sum(tf.multiply(self.Qout, self.actions_onehot),
                               reduction_indices=1,
                               name='Q')
        self.td_error = tf.square(self.targetQ - self.Q, name='td_error')
        self.loss = tf.reduce_mean(self.td_error, name='loss')
        self.trainer = tf.train.AdamOptimizer(
            learning_rate=self.var_learning_rate)
        self.train_op = self.trainer.minimize(self.loss,
                                              global_step=self.global_step,
                                              name='train_update')

    def createupdateTargetNetOp(self):
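        # Soft (Polyak) target update, one assign op per variable:
        #   theta_target <- tau * theta_main + (1 - tau) * theta_target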
        self.assign_op = {}
        for from_, to_ in zip(self.MainNet_vars, self.TargetNet_vars):
            self.assign_op[to_.name] = to_.assign(self.tau * from_ +
                                                  (1 - self.tau) * to_)

    def updateTargetNet(self):
        for var in self.TargetNet_vars:
            self.sess.run(self.assign_op[var.name])

    def conv2d_layer(self,
                     input,
                     filter_size,
                     out_dim,
                     name,
                     strides,
                     func=tf.nn.relu,
                     padding='SAME'):
        in_dim = input.get_shape()[-1].value
        d = 1.0 / np.sqrt(filter_size * filter_size * in_dim)
        with tf.variable_scope(name):
            w_init = tf.random_uniform_initializer(-d, d)
            b_init = tf.random_uniform_initializer(-d, d)
            w = tf.get_variable(
                'w',
                shape=[filter_size, filter_size, in_dim, out_dim],
                dtype=tf.float32,
                initializer=w_init)
            b = tf.get_variable('b', shape=[out_dim], initializer=b_init)

            output = tf.nn.conv2d(input, w, strides=strides,
                                  padding=padding) + b
            if func is not None:
                output = func(output)

        return output

    def mpool_layer(self, input_op, mpool_size, strides, name):
        with tf.variable_scope(name):
            output = tf.nn.max_pool(input_op,
                                    ksize=[1, mpool_size, mpool_size, 1],
                                    strides=strides,
                                    padding="SAME")
        return output

    def fc_layer(self, input, out_dim, name, func=tf.nn.relu):
        in_dim = input.get_shape()[-1].value
        d = 1.0 / np.sqrt(in_dim)
        with tf.variable_scope(name):
            w_init = tf.random_uniform_initializer(-d, d)
            b_init = tf.random_uniform_initializer(-d, d)
            w = tf.get_variable('w',
                                dtype=tf.float32,
                                shape=[in_dim, out_dim],
                                initializer=w_init)
            b = tf.get_variable('b',
                                dtype=tf.float32,
                                shape=[out_dim],
                                initializer=b_init)

            output = tf.matmul(input, w) + b
            if func is not None:
                output = func(output)

        return output

    def _create_tensor_board(self):
        summaries = tf.get_collection(tf.GraphKeys.SUMMARIES)
        summaries.append(tf.summary.scalar("Loss", self.loss))
        for var in tf.trainable_variables():
            summaries.append(tf.summary.histogram("W_%s" % var.name, var))

        summaries.append(tf.summary.histogram("conv1", self.conv1))
        summaries.append(tf.summary.histogram("conv2", self.conv2))
        summaries.append(tf.summary.histogram("conv3", self.conv3))
        summaries.append(tf.summary.histogram("pool1", self.pool1))
        summaries.append(tf.summary.histogram("conv4", self.conv4))
        summaries.append(tf.summary.histogram("Advantage", self.Advantage))
        summaries.append(tf.summary.histogram("Value", self.Value))
        summaries.append(tf.summary.histogram("Qout", self.Qout))
        summaries.append(tf.summary.histogram("Q", self.Q))

        self.summary_op = tf.summary.merge(summaries)
        self.log_writer = tf.summary.FileWriter("logs/%s" % self.model_name,
                                                self.sess.graph)

    def log(self, y_batch, action_batch, state_batch):
        feed_dict = {
            self.targetQ: y_batch,
            self.actions: action_batch,
            self.imageIn: state_batch,
            self.var_learning_rate: self.learning_rate
        }
        step, summary = self.sess.run([self.global_step, self.summary_op],
                                      feed_dict=feed_dict)
        self.log_writer.add_summary(summary, step)

    def trainQNetwork(self):
        minibatch = self.replaybuffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        action_batch = np.resize(action_batch, [BATCH_SIZE])

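        # Double DQN target: the online network picks the greedy action
        # a* = argmax_a Q_main(s', a), while the target network evaluates it,
        # giving y = r + GAMMA * Q_target(s', a*) for non-terminal transitions.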
        A = self.sess.run(self.predict,
                          feed_dict={self.imageIn: next_state_batch})
        Q = self.sess.run(self.QoutT,
                          feed_dict={self.imageInT: next_state_batch})
        doubleQ = Q[range(BATCH_SIZE), A]
        targetQ = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                targetQ.append(reward_batch[i])
            else:
                targetQ.append(reward_batch[i] + GAMMA * doubleQ[i])
        # targetQ = np.resize(targetQ, [BATCH_SIZE, 1])
        self.sess.run(self.train_op,
                      feed_dict={
                          self.imageIn: state_batch,
                          self.targetQ: targetQ,
                          self.actions: action_batch,
                          self.var_learning_rate: self.learning_rate
                      })

        self.updateTargetNet()

        if self.episode % configure.SAVE_NET == 0 and self.episode != 0:
            self.saver.save(self.sess,
                            self.model_name + '/network' + '-dqn',
                            global_step=self.episode)

        if configure.TENSORBOARD and self.episode % configure.TENSORBOARD_UPDATE_FREQUENCY == 0 and self.episode != 0:
            self.log(targetQ, action_batch, state_batch)

        self.episode += 1
        self.STARTtrain = True

    def setPerception(self, nextObservation, action, reward, terminal):
        newState = np.concatenate(
            (self.currentState[:, :, 4:], nextObservation), axis=2)
        self.replaybuffer.add(self.currentState, action, reward, newState,
                              terminal)
        # self.replayMemory.append((self.currentState, action, reward, newState, terminal))
        if self.episode <= OBSERVE:
            state = "observe"
        elif self.episode > OBSERVE and self.episode <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"

        if self.episode % 100 == 0 and self.STARTtrain:
            print "episode", self.episode , "/ STATE", state, \
                "/ EPSILON", self.epsilon

        self.currentState = newState

    def Perce_Train(self):
        if self.replaybuffer.count() > configure.REPLAY_START_SIZE:
            self.trainQNetwork()

    def getAction(self):
        if np.random.rand(1) < self.epsilon:
            action_get = np.random.randint(0, self.action_dim)
        else:
            action_get = self.sess.run(
                self.predict, feed_dict={self.imageIn: [self.currentState]})

        if self.epsilon > FINAL_EPSILON and self.episode > OBSERVE:
            self.epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        return action_get

    def setInitState_rgb(self, observation):
        self.currentState = observation
        for i in xrange(configure.STACKED_FRAMES - 1):
            self.currentState = np.concatenate(
                (self.currentState, observation), axis=2)
Example #12
class DDPG(object):
    def __init__(self, a_dim, s_dim, a_bound, m_dim, pixel_meter, att_dim):
        self.time_step = 1
        self.memory = ReplayBuffer(MEMORY_CAPACITY)
        self.exploration_noise = OUNoise(a_dim)
        self.pointer = 0
        self.sess = tf.Session()
        writer = tf.summary.FileWriter("logs/", self.sess.graph)

        self.a_dim, self.s_dim, self.a_bound, self.m_dim, self.pixel_meter, self.att_dim = \
            a_dim, s_dim, a_bound, m_dim, pixel_meter, att_dim
        self.S = tf.placeholder(tf.float32, [None, s_dim], 's')
        self.S_ = tf.placeholder(tf.float32, [None, s_dim], 's_')
        self.R = tf.placeholder(tf.float32, [None, 1], 'r')
        self.GM = tf.placeholder(tf.float32, [None, m_dim, m_dim, 1], 'gm')
        self.LM = tf.placeholder(tf.int32, [None, att_dim*2+1, att_dim*2+1, 4], 'lm')
        self.LM_ = tf.placeholder(tf.int32, [None, att_dim*2+1, att_dim*2+1, 4], 'lm_')

        self.a = self._build_a(self.S, self.GM, self.LM, )
        q = self._build_c(self.S, self.GM, self.LM, self.a, )
        a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Actor')
        c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Critic')
        ema = tf.train.ExponentialMovingAverage(decay=1 - TAU)  # soft replacement
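        # Target actor/critic are realized by reading the exponential moving
        # averages of the online weights through the custom getter below, which
        # is equivalent to the soft update theta' <- (1 - TAU)*theta' + TAU*theta.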

        def ema_getter(getter, name, *args, **kwargs):
            return ema.average(getter(name, *args, **kwargs))

        target_update = [ema.apply(a_params), ema.apply(c_params)]  # soft update operation
        a_ = self._build_a(self.S_, self.GM, self.LM_, reuse=True, custom_getter=ema_getter)  # replaced target parameters
        q_ = self._build_c(self.S_, self.GM, self.LM_, a_, reuse=True, custom_getter=ema_getter)

        a_loss = - tf.reduce_mean(q)  # maximize the q
        self.atrain = tf.train.AdamOptimizer(LR_A).minimize(a_loss, var_list=a_params)

        with tf.control_dependencies(target_update):  # soft replacement happened at here
            q_target = self.R + GAMMA * q_
            td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q)
            self.ctrain = tf.train.AdamOptimizer(LR_C).minimize(td_error, var_list=c_params)

        self.sess.run(tf.global_variables_initializer())

    def noise_action(self, s1, gm1, loc1):
        locm = np.zeros([1, self.att_dim*2+1, self.att_dim*2+1, 4])
        for j in range(self.att_dim * 2 + 1):
            for k in range(self.att_dim * 2 + 1):
                locm[0, j, k, :] = np.array([0, loc1[0] - self.att_dim + j, loc1[1] - self.att_dim + k, 0])
        return self.sess.run(self.a, {self.S: s1[np.newaxis, :], self.GM: gm1[np.newaxis, :, :, np.newaxis],
                                      self.LM: locm})[0] + self.exploration_noise.noise()

    def action(self, s1, gm1, loc1):
        locm = np.zeros([1, self.att_dim * 2 + 1, self.att_dim * 2 + 1, 4])
        for j in range(self.att_dim * 2 + 1):
            for k in range(self.att_dim * 2 + 1):
                locm[0, j, k, :] = np.array([0, loc1[0] - self.att_dim + j, loc1[1] - self.att_dim + k, 0])
        return self.sess.run(self.a, {self.S: s1[np.newaxis, :], self.GM: gm1[np.newaxis, :, :, np.newaxis],
                                      self.LM: locm})[0]

    def perceive(self, sd, p, loc, s, a_store, r, s_, loc_, done):
        self.memory.add(sd, p, loc, s, a_store, r, s_, loc_, done)
        if self.memory.count() > REPLAY_START:
            self.learn()
        if self.time_step % 500000 == 0:
            self.save_network()

    def learn(self):
        self.time_step += 1
        replay = self.memory.get_batch(BATCH_SIZE)
        bm_sd = np.asarray([data[0] for data in replay])
        bp = np.asarray([data[1] for data in replay])
        bloc = np.asarray([data[2] for data in replay])
        bs = np.asarray([data[3] for data in replay])
        ba = np.asarray([data[4] for data in replay])
        br = np.reshape(np.asarray([data[5] for data in replay]), [-1, 1])
        bs_ = np.asarray([data[6] for data in replay])
        bloc_ = np.asarray([data[7] for data in replay])
        bgm = np.zeros([BATCH_SIZE, self.m_dim, self.m_dim, 1])
        for batch in range(BATCH_SIZE):
            sd1 = bm_sd[batch]
            terrain_map = grid_map(sd1, self.m_dim, self.pixel_meter, bp[batch])
            bgm[batch, :, :, 0] = terrain_map.map_matrix
        blocm = np.zeros([BATCH_SIZE, self.att_dim*2+1, self.att_dim*2+1, 4])
        blocm_ = np.zeros([BATCH_SIZE, self.att_dim * 2 + 1, self.att_dim * 2 + 1, 4])
        for i in range(BATCH_SIZE):
            for j in range(self.att_dim*2+1):
                for k in range(self.att_dim*2+1):
                    blocm[i, j, k, :] = np.array([i, bloc[i, 0]-self.att_dim+j, bloc[i, 1]-self.att_dim+k, 0])
                    blocm_[i, j, k, :] = np.array([i, bloc_[i, 0] - self.att_dim + j, bloc_[i, 1] - self.att_dim + k, 0])

        self.sess.run(self.atrain, {self.S: bs, self.GM: bgm, self.LM: blocm})
        self.sess.run(self.ctrain, {self.GM: bgm, self.S: bs, self.LM: blocm, self.a: ba, self.R: br, self.S_: bs_, self.LM_: blocm_})

    def _build_a(self, s, gm, locm, reuse=None, custom_getter=None):

        def _conv2d_keep_size(x, y, kernel_size, name, use_bias=False, reuse_conv=None, trainable_conv=True):
            return tf.layers.conv2d(inputs=x,
                                    filters=y,
                                    kernel_size=kernel_size,
                                    padding="same",
                                    use_bias=use_bias,
                                    kernel_initializer=tf.truncated_normal_initializer(stddev=0.01),
                                    bias_initializer=tf.truncated_normal_initializer(stddev=0.01),
                                    reuse=reuse_conv,
                                    name=name,
                                    trainable=trainable_conv)

        def _build_vin(mat, name, reuse, trainable_vin):
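            # Value-iteration-style module (sketch): a reward map r is estimated
            # from the input, then Q/V maps are refined by repeated convolution
            # and channel-wise max, approximating several steps of value iteration.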
            h1 = _conv2d_keep_size(mat, 150, 3, name+"_h1", use_bias=True, reuse_conv=reuse, trainable_conv=trainable_vin)
            r = _conv2d_keep_size(h1, 1, 1, name+"_r", reuse_conv=reuse, trainable_conv=trainable_vin)
            q0 = _conv2d_keep_size(r, 10, 9, name+"_q0", reuse_conv=reuse, trainable_conv=trainable_vin)
            v = tf.reduce_max(q0, axis=3, keep_dims=True, name=name+"_v")
            rv = tf.concat([r, v], axis=3)
            q = _conv2d_keep_size(rv, 10, 9, name + "_q", reuse_conv=False, trainable_conv=trainable_vin)
            v = tf.reduce_max(q, axis=3, keep_dims=True, name=name + "_v")
            for k in range(30):
                rv = tf.concat([r, v], axis=3)
                q = _conv2d_keep_size(rv, 10, 9, name+"_q", reuse_conv=True, trainable_conv=trainable_vin)
                v = tf.reduce_max(q, axis=3, keep_dims=True, name=name+"_v")
            return v

        trainable = True if reuse is None else False
        with tf.variable_scope('Actor', reuse=reuse, custom_getter=custom_getter):
            gv = _build_vin(gm, name="global_map_vin", reuse=reuse, trainable_vin=trainable)
            att = tf.reshape(tf.gather_nd(gv, locm), [-1, (self.att_dim*2+1)**2])
            layer_1 = tf.layers.dense(s, 300, activation=tf.nn.relu, name='l1', trainable=trainable)
            layer_2a = tf.layers.dense(layer_1, 600, name='l2a', trainable=trainable)
            layer_2att = tf.layers.dense(att, 600, name='l2att', trainable=trainable)
            layer_2 = tf.add(layer_2a, layer_2att, name="l2")
            layer_3 = tf.layers.dense(layer_2, 600, activation=tf.nn.relu, name='l3', trainable=trainable)
            a = tf.layers.dense(layer_3, 7, activation=tf.nn.tanh, name='a1', trainable=trainable)
            return a

    def _build_c(self, s, gm, loc, a, reuse=None, custom_getter=None):
        trainable = True if reuse is None else False
        with tf.variable_scope('Critic', reuse=reuse, custom_getter=custom_getter):
            gm_flat = tf.reshape(gm, [-1, self.m_dim**2])
            layer_gm = tf.layers.dense(gm_flat, self.s_dim, activation=tf.nn.relu, name='lgm', trainable=trainable)
            s_all = tf.concat([layer_gm, s], axis=1)
            layer_1 = tf.layers.dense(s_all, 300, activation=tf.nn.relu, name='l1', trainable=trainable)
            layer_2s = tf.layers.dense(layer_1, 600, activation=None, name='l2s', trainable=trainable)
            layer_2a = tf.layers.dense(a, 600, activation=None, name='l2a', trainable=trainable)
            layer_2 = tf.add(layer_2s, layer_2a, name="l2")
            layer_3 = tf.layers.dense(layer_2, 600, activation=tf.nn.relu, name='l3', trainable=trainable)
            return tf.layers.dense(layer_3, 1, trainable=trainable)  # Q(s,a)

    def save_network(self):
        self.saver = tf.train.Saver()
        print("save ddpg-network...", self.time_step)
        self.saver.save(self.sess, 'saved_ddpg_networks/' + "ddpg-network", global_step=self.time_step)

    def load_network(self):
        self.saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state("saved_ddpg_networks")
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print("Successfully loaded:", checkpoint.model_checkpoint_path)
        else:
            print("Could not find old network weights")
Example #13
class NeuralAgent():
    def __init__(self, track_name='practgt2.xml'):
        BUFFER_SIZE = 100000
        TAU = 0.001  # Target network soft-update rate
        LRA = 0.0001  # Learning rate for Actor
        LRC = 0.001  # Learning rate for Critic
        state_dim = 29  # Dimension of sensor inputs
        self.batch_size = 32
        self.lambda_mix = 10.0
        self.action_dim = 3  # Steering/Acceleration/Brake

        # Tensorflow GPU optimization
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        from keras import backend as K
        K.set_session(sess)

        self.actor = ActorNetwork(sess, state_dim, self.action_dim,
                                  self.batch_size, TAU, LRA)
        self.critic = CriticNetwork(sess, state_dim, self.action_dim,
                                    self.batch_size, TAU, LRC)
        self.buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer
        self.track_name = track_name

        self.save = dict(total_reward=[],
                         total_step=[],
                         ave_reward=[],
                         distRaced=[],
                         distFromStart=[],
                         lastLapTime=[],
                         curLapTime=[],
                         lapTimes=[],
                         avelapTime=[],
                         ave_sp=[],
                         max_sp=[],
                         min_sp=[],
                         test_total_reward=[],
                         test_total_step=[],
                         test_ave_reward=[],
                         test_distRaced=[],
                         test_distFromStart=[],
                         test_lastLapTime=[],
                         test_curLapTime=[],
                         test_lapTimes=[],
                         test_avelapTime=[],
                         test_ave_sp=[],
                         test_max_sp=[],
                         test_min_sp=[])

    def rollout(self, env):
        max_steps = 10000

        vision = False

        # zhichen: running two TORCS environments/UDP connections at once is not stable
        # env = TorcsEnv(vision=vision, throttle=True, gear_change=False, track_name=self.track_name)

        ob = env.reset(relaunch=True)
        s_t = np.hstack((ob.speedX, ob.angle, ob.trackPos, ob.speedY,
                         ob.speedZ, ob.rpm, ob.wheelSpinVel / 100.0, ob.track))

        total_reward = 0.

        sp = []

        lastLapTime = []

        for j_iter in range(max_steps):

            a_t = self.actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            a_t = a_t[0]
            # print('test a_t:', a_t)
            a_t[0] = clip(a_t[0], -1, 1)
            a_t[1] = clip(a_t[1], 0, 1)
            a_t[2] = clip(a_t[2], 0, 1)

            ob, r_t, done, info = env.step(a_t)

            sp.append(info['speed'])

            if lastLapTime == []:
                if info['lastLapTime'] > 0:
                    lastLapTime.append(info['lastLapTime'])
            elif info['lastLapTime'] > 0 and lastLapTime[-1] != info[
                    'lastLapTime']:
                lastLapTime.append(info['lastLapTime'])

            if np.mod(j_iter + 1, 20) == 0:
                logging.info('step: ' + str(j_iter + 1))
                print('\n ob: ', ob)

            s_t = np.hstack(
                (ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ,
                 ob.rpm, ob.wheelSpinVel / 100.0, ob.track))

            total_reward += r_t

            if done: break

        logging.info("Test Episode Reward: " + str(total_reward) +
                     " Episode Length: " + str(j_iter + 1) + " Ave Reward: " +
                     str(total_reward / (j_iter + 1)) + "\n Distance: " +
                     str(info['distRaced']) + ' ' +
                     str(info['distFromStart']) + "\n Last Lap Times: " +
                     str(info['lastLapTime']) + " Cur Lap Times: " +
                     str(info['curLapTime']) + " lastLaptime: " +
                     str(lastLapTime) + "\n ave sp: " + str(np.mean(sp)) +
                     " max sp: " + str(np.max(sp)))
        #logging.info(" Total Steps: " + str(step) + " " + str(i_episode) + "-th Episode Reward: " + str(total_reward) +
        #            " Episode Length: " + str(j_iter+1) + "  Distance" + str(ob.distRaced) + " Lap Times: " + str(ob.lastLapTime))

        #env.end()  # This is for shutting down TORCS

        ave_sp = np.mean(sp)
        max_sp = np.max(sp)
        min_sp = np.min(sp)

        return total_reward, j_iter + 1, info, ave_sp, max_sp, min_sp, lastLapTime

    def update_neural(self,
                      controllers,
                      episode_count=200,
                      tree=False,
                      seed=1337):
        OU = FunctionOU()
        vision = False
        GAMMA = 0.99
        EXPLORE = 100000.
        max_steps = 10000
        reward = 0
        done = False
        step = 0
        epsilon = 1

        if not tree:
            steer_prog, accel_prog, brake_prog = controllers

        # Generate a Torcs environment
        env = TorcsEnv(vision=vision,
                       throttle=True,
                       gear_change=False,
                       track_name=self.track_name)

        window = 5
        lambda_store = np.zeros((max_steps, 1))
        lambda_max = 40.
        factor = 0.8

        logging.info("TORCS Experiment Start with Lambda = " +
                     str(self.lambda_mix))

        for i_episode in range(episode_count):
            logging.info("Episode : " + str(i_episode) + " Replay Buffer " +
                         str(self.buff.count()))
            if np.mod(i_episode, 3) == 0:
                logging.info('relaunch TORCS')
                # relaunch TORCS every 3 episodes because of a memory leak
                ob = env.reset(relaunch=True)
            else:
                logging.info('reset TORCS')
                ob = env.reset()

            #[ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ, ob.rpm, list(ob.wheelSpinVel / 100.0), list(ob.track)]
            s_t = np.hstack(
                (ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ,
                 ob.rpm, ob.wheelSpinVel / 100.0, ob.track))

            total_reward = 0.
            tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY],
                       [ob.speedZ], [ob.rpm],
                       list(ob.wheelSpinVel / 100.0),
                       list(ob.track), [0, 0, 0]]
            window_list = [tempObs[:] for _ in range(window)]

            sp = []

            lastLapTime = []

            for j_iter in range(max_steps):
                if tree:
                    tree_obs = [
                        sensor for obs in tempObs[:-1] for sensor in obs
                    ]
                    act_tree = controllers.predict([tree_obs])
                    steer_action = clip_to_range(act_tree[0][0], -1, 1)
                    accel_action = clip_to_range(act_tree[0][1], 0, 1)
                    brake_action = clip_to_range(act_tree[0][2], 0, 1)
                else:
                    steer_action = clip_to_range(
                        steer_prog.pid_execute(window_list), -1, 1)
                    accel_action = clip_to_range(
                        accel_prog.pid_execute(window_list), 0, 1)
                    brake_action = clip_to_range(
                        brake_prog.pid_execute(window_list), 0, 1)
                action_prior = [steer_action, accel_action, brake_action]

                tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY],
                           [ob.speedZ], [ob.rpm],
                           list(ob.wheelSpinVel / 100.0),
                           list(ob.track), action_prior]
                window_list.pop(0)
                window_list.append(tempObs[:])

                loss = 0
                epsilon -= 1.0 / EXPLORE
                a_t = np.zeros([1, self.action_dim])
                noise_t = np.zeros([1, self.action_dim])

                a_t_original = self.actor.model.predict(
                    s_t.reshape(1, s_t.shape[0]))
                noise_t[0][0] = max(epsilon, 0) * OU.function(
                    a_t_original[0][0], 0.0, 0.60, 0.30)
                noise_t[0][1] = max(epsilon, 0) * OU.function(
                    a_t_original[0][1], 0.5, 1.00, 0.10)
                noise_t[0][2] = max(epsilon, 0) * OU.function(
                    a_t_original[0][2], 0, 1.00, 0.05)

                a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
                a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
                a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

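                # Blend the learned action with the programmatic prior:
                #   a_mixed = a_RL / (1 + lambda) + (lambda / (1 + lambda)) * a_prior,
                # so a large lambda_mix defers to the prior controller while
                # lambda_mix -> 0 recovers the pure RL policy.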
                mixed_act = [
                    a_t[0][k_iter] / (1 + self.lambda_mix) +
                    (self.lambda_mix /
                     (1 + self.lambda_mix)) * action_prior[k_iter]
                    for k_iter in range(3)
                ]

                ob, r_t, done, info = env.step(mixed_act)

                sp.append(info['speed'])

                if lastLapTime == []:
                    if info['lastLapTime'] > 0:
                        lastLapTime.append(info['lastLapTime'])
                elif info['lastLapTime'] > 0 and lastLapTime[-1] != info[
                        'lastLapTime']:
                    lastLapTime.append(info['lastLapTime'])

                s_t1 = np.hstack(
                    (ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ,
                     ob.rpm, ob.wheelSpinVel / 100.0, ob.track))

                self.buff.add(s_t, a_t[0], r_t, s_t1,
                              done)  # Add replay buffer

                # Do the batch update
                batch = self.buff.getBatch(self.batch_size)
                states = np.asarray([e[0] for e in batch])
                actions = np.asarray([e[1] for e in batch])
                rewards = np.asarray([e[2] for e in batch])
                new_states = np.asarray([e[3] for e in batch])
                dones = np.asarray([e[4] for e in batch])
                y_t = np.zeros((states.shape[0], 1))

                target_q_values = self.critic.target_model.predict(
                    [new_states,
                     self.actor.target_model.predict(new_states)])

                for k in range(len(batch)):
                    if dones[k]:
                        y_t[k] = rewards[k]
                    else:
                        y_t[k] = rewards[k] + GAMMA * target_q_values[k]

                loss += self.critic.model.train_on_batch([states, actions],
                                                         y_t)
                a_for_grad = self.actor.model.predict(states)
                grads = self.critic.gradients(states, a_for_grad)
                self.actor.train(states, grads)
                self.actor.target_train()
                self.critic.target_train()

                total_reward += r_t
                s_t = s_t1

                # Control prior mixing term
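                # lambda is adapted from the discrepancy between the current
                # target Q values and the previous batch's (base_q):
                #   lambda = lambda_max * (1 - exp(-factor * |r + GAMMA * mean(Q' - Q'_base)|)),
                # and is held at 10 for the first 50 episodes and the first step.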
                if j_iter > 0 and i_episode > 50:
                    lambda_track = lambda_max * (1 - np.exp(-factor * np.abs(
                        r_t +
                        GAMMA * np.mean(target_q_values[-1] - base_q[-1]))))
                    lambda_track = np.squeeze(lambda_track)
                else:
                    lambda_track = 10.
                lambda_store[j_iter] = lambda_track
                base_q = copy.deepcopy(target_q_values)

                if np.mod(step, 2000) == 0:
                    logging.info("Episode " + str(i_episode) + " Distance " +
                                 str(ob.distRaced) + " Lap Times " +
                                 str(ob.lastLapTime))

                step += 1
                if done:
                    break

            #else:
            #    env.end()

            self.lambda_mix = np.mean(lambda_store)

            logging.info('Episode ends! \n' + "Total Steps: " + str(step) +
                         " " + str(i_episode) + "-th Episode Reward: " +
                         str(total_reward) + " Episode Length: " +
                         str(j_iter + 1) + " Ave Reward: " +
                         str(total_reward / (j_iter + 1)) + "\n Distance: " +
                         str(info['distRaced']) + ' ' +
                         str(info['distFromStart']) + "\n Last Lap Times: " +
                         str(info['lastLapTime']) + " Cur Lap Times: " +
                         str(info['curLapTime']) + " lastLaptime: " +
                         str(lastLapTime) + "\n ave sp: " + str(np.mean(sp)) +
                         " max sp: " + str(np.max(sp)))

            #logging.info(" Lambda Mix: " + str(self.lambda_mix))

            self.save['total_reward'].append(total_reward)
            self.save['total_step'].append(j_iter + 1)
            self.save['ave_reward'].append(total_reward / (j_iter + 1))

            self.save['distRaced'].append(info['distRaced'])
            self.save['distFromStart'].append(info['distFromStart'])

            self.save['lastLapTime'].append(info['lastLapTime'])
            self.save['curLapTime'].append(info['curLapTime'])
            self.save['lapTimes'].append(lastLapTime)
            if lastLapTime == []:
                self.save['avelapTime'].append(0)
            else:
                self.save['avelapTime'].append(np.mean(lastLapTime))

            self.save['ave_sp'].append(np.mean(sp))
            self.save['max_sp'].append(np.max(sp))
            self.save['min_sp'].append(np.min(sp))

            # test
            if np.mod(i_episode + 1, 10) == 0:
                logging.info("Start Testing!")
                test_total_reward, test_step, test_info, test_ave_sp, test_max_sp, test_min_sp, test_lastLapTime = self.rollout(
                    env)
                self.save['test_total_reward'].append(test_total_reward)
                self.save['test_total_step'].append(test_step)
                self.save['test_ave_reward'].append(test_total_reward /
                                                    test_step)

                self.save['test_distRaced'].append(test_info['distRaced'])
                self.save['test_distFromStart'].append(
                    test_info['distFromStart'])

                self.save['test_lastLapTime'].append(test_info['lastLapTime'])
                self.save['test_curLapTime'].append(test_info['curLapTime'])
                self.save['test_lapTimes'].append(test_lastLapTime)

                if test_lastLapTime == []:
                    self.save['test_avelapTime'].append(0)
                else:
                    self.save['test_avelapTime'].append(
                        np.mean(test_lastLapTime))

                self.save['test_ave_sp'].append(test_ave_sp)
                self.save['test_max_sp'].append(test_max_sp)
                self.save['test_min_sp'].append(test_min_sp)

            if np.mod(i_episode + 1, 5) == 0:
                print("Now we save model")
                #os.remove("actormodel.h5")
                self.actor.model.save_weights("actormodel_" + str(seed) +
                                              ".h5",
                                              overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(self.actor.model.to_json(), outfile)

                #os.remove("criticmodel.h5")
                self.critic.model.save_weights("criticmodel_" + str(seed) +
                                               ".h5",
                                               overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(self.critic.model.to_json(), outfile)

                filename = "./model/actormodel_" + str(seed) + '_' + str(
                    i_episode + 1) + ".h5"
                dirname = os.path.dirname(filename)
                if not os.path.exists(dirname):
                    os.makedirs(dirname)
                self.actor.model.save_weights(filename, overwrite=True)
                filename = "./model/criticmodel_" + str(seed) + '_' + str(
                    i_episode + 1) + ".h5"
                dirname = os.path.dirname(filename)
                if not os.path.exists(dirname):
                    os.makedirs(dirname)
                self.critic.model.save_weights(filename, overwrite=True)

            if np.mod(i_episode + 1, 10) == 0:
                filename = "./Fig/iprl_save_" + str(seed)
                dirname = os.path.dirname(filename)
                if not os.path.exists(dirname):
                    os.makedirs(dirname)
                with open(filename, 'wb') as f:
                    pickle.dump(self.save, f)

            if i_episode > 1000 and all(
                    np.array(self.save['total_reward'][-20:]) < 20):
                print('Model degenerated. Stopping at episode ' + str(i_episode))
                break

        env.end()  # This is for shutting down TORCS
        logging.info("Neural Policy Update Finish.")
        return None

    def collect_data(self, controllers, tree=False):

        vision = False

        max_steps = 10000

        step = 0

        if not tree:
            steer_prog, accel_prog, brake_prog = controllers

        # Generate a Torcs environment
        env = TorcsEnv(vision=vision,
                       throttle=True,
                       gear_change=False,
                       track_name=self.track_name)
        ob = env.reset(relaunch=True)
        print("S0=", ob)

        window = 5
        lambda_store = np.zeros((max_steps, 1))
        lambda_max = 40.
        factor = 0.8

        logging.info("TORCS Collection started with Lambda = " +
                     str(self.lambda_mix))

        s_t = np.hstack((ob.speedX, ob.angle, ob.trackPos, ob.speedY,
                         ob.speedZ, ob.rpm, ob.wheelSpinVel / 100.0, ob.track))

        total_reward = 0.
        tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY],
                   [ob.speedZ], [ob.rpm],
                   list(ob.wheelSpinVel / 100.0),
                   list(ob.track), [0, 0, 0]]
        window_list = [tempObs[:] for _ in range(window)]

        observation_list = []
        actions_list = []

        lastLapTime = []
        sp = []

        for j_iter in range(max_steps):
            if tree:
                tree_obs = [sensor for obs in tempObs[:-1] for sensor in obs]
                act_tree = controllers.predict([tree_obs])
                steer_action = clip_to_range(act_tree[0][0], -1, 1)
                accel_action = clip_to_range(act_tree[0][1], 0, 1)
                brake_action = clip_to_range(act_tree[0][2], 0, 1)
            else:
                steer_action = clip_to_range(
                    steer_prog.pid_execute(window_list), -1, 1)
                accel_action = clip_to_range(
                    accel_prog.pid_execute(window_list), 0, 1)
                brake_action = clip_to_range(
                    brake_prog.pid_execute(window_list), 0, 1)

            action_prior = [steer_action, accel_action, brake_action]

            tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY],
                       [ob.speedZ], [ob.rpm],
                       list(ob.wheelSpinVel / 100.0),
                       list(ob.track), action_prior]
            window_list.pop(0)
            window_list.append(tempObs[:])

            a_t = self.actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            mixed_act = [
                a_t[0][k_iter] / (1 + self.lambda_mix) +
                (self.lambda_mix /
                 (1 + self.lambda_mix)) * action_prior[k_iter]
                for k_iter in range(3)
            ]
            if tree:
                newobs = [item for sublist in tempObs[:-1] for item in sublist]
                observation_list.append(newobs[:])
            else:
                observation_list.append(window_list[:])
            actions_list.append(mixed_act[:])
            ob, r_t, done, info = env.step(mixed_act)

            sp.append(info['speed'])

            if lastLapTime == []:
                if info['lastLapTime'] > 0:
                    lastLapTime.append(info['lastLapTime'])
            elif info['lastLapTime'] > 0 and lastLapTime[-1] != info[
                    'lastLapTime']:
                lastLapTime.append(info['lastLapTime'])

            s_t1 = np.hstack(
                (ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ,
                 ob.rpm, ob.wheelSpinVel / 100.0, ob.track))

            total_reward += r_t
            s_t = s_t1
            #if np.mod(step, 2000) == 0:
            #    logging.info(" Distance " + str(ob.distRaced) + " Lap Times " + str(ob.lastLapTime))

            step += 1
            if done:
                break

        logging.info("Data Collection Finished!")
        logging.info('Episode ends! \n' + "Episode Reward: " +
                     str(total_reward) + " Episode Length: " +
                     str(j_iter + 1) + " Ave Reward: " + str(total_reward /
                                                             (j_iter + 1)) +
                     "\n Distance: " + str(info['distRaced']) + ' ' +
                     str(info['distFromStart']) + "\n Last Lap Times: " +
                     str(info['lastLapTime']) + " Cur Lap Times: " +
                     str(info['curLapTime']) + " lastLaptime: " +
                     str(lastLapTime) + "\n ave sp: " + str(np.mean(sp)) +
                     " max sp: " + str(np.max(sp)))
        env.end()

        return observation_list, actions_list

    def label_data(self, controllers, observation_list, tree=False):
        if not tree:
            steer_prog, accel_prog, brake_prog = controllers
        actions_list = []
        net_obs_list = []
        logging.info("Data labelling started with Lambda = " +
                     str(self.lambda_mix))
        for window_list in observation_list:
            if tree:
                act_tree = controllers.predict([window_list])
                steer_action = clip_to_range(act_tree[0][0], -1, 1)
                accel_action = clip_to_range(act_tree[0][1], 0, 1)
                brake_action = clip_to_range(act_tree[0][2], 0, 1)
                # In tree mode the stored observation is already a flat sensor list
                net_obs = window_list
                net_obs_list.append(window_list)
            else:
                steer_action = clip_to_range(
                    steer_prog.pid_execute(window_list), -1, 1)
                accel_action = clip_to_range(
                    accel_prog.pid_execute(window_list), 0, 1)
                brake_action = clip_to_range(
                    brake_prog.pid_execute(window_list), 0, 1)
                net_obs = [sensor for obs in window_list[-1] for sensor in obs]
                net_obs_list.append(net_obs[:29])

            action_prior = [steer_action, accel_action, brake_action]

            s_t = np.hstack([[net_obs[:29]]])
            a_t = self.actor.model.predict(s_t.reshape(1, 29))
            mixed_act = [
                a_t[0][k_iter] / (1 + self.lambda_mix) +
                (self.lambda_mix /
                 (1 + self.lambda_mix)) * action_prior[k_iter]
                for k_iter in range(3)
            ]

            actions_list.append(mixed_act[:])

        return net_obs_list, observation_list, actions_list
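
The mixed_act computed above is a convex combination of the network action and the programmatic prior, a_mix = a_net/(1 + lambda) + lambda/(1 + lambda) * a_prior, so lambda = 0 trusts the network and a large lambda trusts the prior. A minimal self-contained sketch of that formula (the function name and sample values are illustrative, not part of the original code):

import numpy as np

def mix_actions(a_net, a_prior, lambda_mix):
    # convex combination: a_net/(1+lambda) + lambda/(1+lambda) * a_prior
    a_net = np.asarray(a_net, dtype=float)
    a_prior = np.asarray(a_prior, dtype=float)
    return (a_net + lambda_mix * a_prior) / (1.0 + lambda_mix)

# e.g. steering/accel/brake triples; values are illustrative only
print(mix_actions([0.2, 0.8, 0.0], [-0.1, 0.6, 0.0], lambda_mix=1.0))  # roughly [0.05, 0.7, 0.0]
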
class DDPG:
    def __init__(self, env, state_dim, action_dim):
        self.name = 'DDPG'
        self.environment = env
        self.time_step = 0
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    def train(self):
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])
        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def action(self, state):
        action = self.actor_network.action(state)

        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)
        if self.replay_buffer.count() == REPLAY_START_SIZE:
            print('\n---------------Start training---------------')
        # Store transitions to replay start size then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.time_step += 1
            self.train()

        if self.time_step % 10000 == 0 and self.time_step > 0:
            self.actor_network.save_network(self.time_step)
            self.critic_network.save_network(self.time_step)

        return self.time_step
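
A minimal sketch of how an agent like the class above is typically driven, assuming a Gym-style environment with continuous actions; the helper name and loop bounds are illustrative only:

def run_episodes(env, agent, num_episodes=100, max_steps=1000):
    # generic interaction loop: act, step, store, (maybe) train inside perceive()
    for _ in range(num_episodes):
        state = env.reset()
        for _ in range(max_steps):
            action = agent.action(state)          # deterministic policy; add noise for exploration
            next_state, reward, done, _ = env.step(action)
            agent.perceive(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
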
Example #15
class DDPG:
    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        # self.state_dim = env.observation_space.shape[0] * 2
        self.action_dim = env.action_space.shape[0]

        self.time_step = 0
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (Ornstein-Uhlenbeck) for action exploration
        # self.exploration_noise = OUNoise(self.action_dim)
        self.exploration_noise = OUNoise()
        # loading networks
        self.saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state(MODEL_PATH)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            my_config.logger.warn("Successfully loaded: %s" %
                                  (checkpoint.model_checkpoint_path))
        else:
            my_config.logger.error("Could not find old network weights")

    def train(self):
        # my_config.logger.debug("......enter tain......")
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        noise = self.exploration_noise.noise(action)
        # if random.random() <= 0.5:
        #     noise = self.exploration_noise.noise(action,
        #         mu=[0, 0, 0, 1, 0, 0, 0.25, 0.75, 0.75, 0, 0, 0, 0, 0.5, 0.5, 0, 0, 0.5])
        # else:
        #     noise = self.exploration_noise.noise(action,
        #         mu=[0, 0, 0, 0, 0.5, 0.5, 0, 0, 0.5, 0, 0, 0, 1, 0, 0, 0.25, 0.75, 0.75])
        noise_action = action + noise
        clipped_noise_action = np.clip(noise_action, 0, 1)
        # if (self.time_step < 5):
        #     my_config.logger.debug("action: %s, noise: %s, clip: %s" % (action, noise, clipped_noise_action))
        return clipped_noise_action

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        self.time_step = self.time_step + 1

        # Store transitions to replay start size then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        #if self.time_step % 10000 == 0:
        #self.actor_network.save_network(self.time_step)
        #self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        # if done:
        #     self.exploration_noise.reset()

    def saveNetwork(self):
        # my_config.logger.warn("time step: %s, save model" % (self.time_step))
        ckpt_file = os.path.join(MODEL_PATH, 'ltr')
        self.saver.save(self.sess, ckpt_file, global_step=self.time_step)
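
The OUNoise class used above is not shown in this snippet; a common Ornstein-Uhlenbeck formulation it may follow is x <- x + theta * (mu - x) + sigma * N(0, 1). The stand-in below is a sketch under that assumption (class name and defaults are not the author's; the author's OUNoise also appears to take the current action and an optional per-dimension mu, which is omitted here):

import numpy as np

class SimpleOUNoise(object):
    """Ornstein-Uhlenbeck process: x += theta * (mu - x) + sigma * N(0, 1)."""
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(action_dim)
        self.theta = theta
        self.sigma = sigma
        self.state = np.copy(self.mu)

    def reset(self):
        self.state = np.copy(self.mu)

    def noise(self):
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state
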
Example #16
class DeepQLearner(object):
    def __init__(self, session,
                       optimizer,
                       q_network,
                       state_dim,
                       num_actions,
                       batch_size=32,
                       init_exp=0.5,       # initial exploration prob
                       final_exp=0.1,      # final exploration prob
                       anneal_steps=10000, # N steps for annealing exploration 
                       replay_buffer_size=10000,
                       store_replay_every=5, # how frequent to store experience
                       discount_factor=0.9, # discount future rewards
                       target_update_rate=0.01,
                       name="DeepQLearner"
                       ):
        """ Initializes the Deep Q Network.

            Args:
                session: A TensorFlow session.
                optimizer: A TensorFlow optimizer.
                q_network: A TensorFlow network that takes in a state and outputs the Q-values over
                           all actions.
                state_dim: Dimension of states.
                num_actions: Number of actions.
                batch_size: Batch size for training with experience replay.
                init_exp: Initial exploration probability for eps-greedy policy.
                final_exp: Final exploration probability for eps-greedy policy.
                anneal_steps: Number of steps to anneal from init_exp to final_exp.
                replay_buffer_size: Size of replay buffer.
                store_replay_every: Frequency with which to store replay.
                discount_factor: For discounting future rewards.
                target_update_rate: For the slow update of the target network.
                name: Used to create a variable scope. Useful for creating multiple
                      networks.
        """
        self.session = session
        self.optimizer = optimizer
        self.q_network = q_network # tensorflow constructor for Q network
        self.state_dim = state_dim
        self.num_actions = num_actions
        self.batch_size = batch_size

        # initialize exploration
        self.exploration = init_exp
        self.init_exp = init_exp
        self.final_exp = final_exp
        self.anneal_steps = anneal_steps

        self.discount_factor = discount_factor
        self.target_update_rate = target_update_rate

        # Initialize the replay buffer.
        self.replay_buffer_size = replay_buffer_size
        self.replay_buffer = ReplayBuffer(replay_buffer_size)
        self.store_replay_every = store_replay_every
        self.experience_cnt = 0

        self.name = name

        self.train_iteration = 0
        self.constructModel()
        self.session.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()

    def constructModel(self):
        """ Constructs the model to do Q-learning.
        """

        # ensure that we don't have conflicts when initializing multiple models
        with tf.variable_scope(self.name):
            # this part of the model is for predicting actions using the learned Q_network.
            with tf.name_scope("predict_actions"):

                # input: vectors of states (in a batch)
                self.states = tf.placeholder(tf.float32, (None, self.state_dim), name="states")

                # use new scope to differentiate this q_network from one used for target evaluation
                # note that this will differentiate the weights, for example "learn_q_network/W1"
                with tf.variable_scope("learn_q_network"):
                    # the current q_network that we train
                    self.action_scores = self.q_network(self.states, self.state_dim, self.num_actions)
                self.predicted_actions = tf.argmax(self.action_scores, axis=1, name="predicted_actions")

            # this part of the model is for estimating future rewards, to be used for the Q-learning
            # update for estimating the target Q-value.
            with tf.name_scope("estimate_future_rewards"):

                # input: vectors of next states (in a batch)
                self.next_states = tf.placeholder(tf.float32, (None, self.state_dim), name="next_states")

                # input: binary inputs that indicate whether states are unfinished or terminal
                # this is important to compute the target and do the Bellman update correctly, since
                # it tells us whether to include the optimal Q value for the next state or not.
                self.unfinished_states_flags = tf.placeholder(tf.float32, (None,), name="unfinished_states_flags")

                # input: rewards from last state and action
                self.rewards = tf.placeholder(tf.float32, (None,), name="rewards")

                # use new scope to differentiate this q_network from one we are training
                # note that this will differentiate the weights, for example "target_q_network/W1"
                with tf.variable_scope("target_q_network"):
                    # the q_network used for evaluation
                    self.eval_q_vals = self.q_network(self.next_states, self.state_dim, self.num_actions)

                # note that this term is only non-zero for a state if it is non-terminal
                # also note the use of stop_gradient to make sure we don't train this q_network
                self.best_future_q_vals = tf.reduce_max(tf.stop_gradient(self.eval_q_vals), axis=1) * self.unfinished_states_flags

                # future rewards given by Bellman equation
                self.future_rewards = self.rewards + self.discount_factor * self.best_future_q_vals

            # this part of the model is for computing the loss and gradients
            with tf.name_scope("loss"):
                # input: one-hot vectors that give the current actions to evaluate the loss for
                self.action_selects = tf.placeholder(tf.float32, (None, self.num_actions), name="action_select")

                # get Q-values for the actions that we took
                self.selected_action_scores = tf.reduce_sum(self.action_scores * self.action_selects, axis=1)

                # temporal difference loss
                self.td_loss = tf.reduce_mean(tf.reduce_sum(tf.square(self.future_rewards - self.selected_action_scores)))

                # cross-entropy loss for adversarial example generation
                self.cross_entropy_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.action_scores, labels=self.action_selects))

                # TODO: regularization loss

                # TODO: gradient clipping

                self.train_op = self.optimizer.minimize(self.td_loss)

            # this part of the model is for updating the target Q network
            with tf.name_scope("eval_q_network_update"):
                target_network_update = []
                # slowly update target network parameters with Q network parameters
                # we do this by grabbing all the parameters in both networks and manually defining
                # update operations
                self.q_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="learn_q_network")
                self.target_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="target_q_network")
                for v_source, v_target in zip(self.q_network_variables, self.target_network_variables):
                    # this is equivalent to target = (1-alpha) * target + alpha * source
                    update_op = v_target.assign_sub(self.target_update_rate * (v_target - v_source))
                    target_network_update.append(update_op)
                # this groups all operations to run together
                # this operation will update all of the target Q network variables
                self.target_network_update = tf.group(*target_network_update)

    def store_experience(self, state, action, reward, next_state, done):
        """ 
        Adds an experience to the replay buffer.
        """
        if self.experience_cnt % self.store_replay_every == 0 or done:
            self.replay_buffer.add(state, action, reward, next_state, done)
        self.experience_cnt += 1

    def greedy_policy(self, states):
        """ 
        Executes the greedy policy. Useful for executing a learned agent.
        """
        return self.session.run(self.predicted_actions, {self.states: states})[0]


    def e_greedy_policy(self, states):
        """ 
        Executes the epsilon greedy policy. 
        """
        # with probability exploration, choose random action
        if random.random() < self.exploration:
            return random.randint(0, self.num_actions-1)
        # choose greedy action given by current Q network
        else:
            return self.greedy_policy(states)


    def annealExploration(self):
        """ 
        Anneals the exploration probability linearly with training iteration.
        """
        ratio = max((self.anneal_steps - self.train_iteration) / float(self.anneal_steps), 0)
        self.exploration = (self.init_exp - self.final_exp) * ratio + self.final_exp

    def updateModel(self):
        """ 
        Update the model by sampling a batch from the replay buffer and
        performing Q-learning updates on the network parameters.
        """

        # not enough experiences yet
        if self.replay_buffer.count() < self.batch_size:
            return

        # sample a random batch from the replay buffer
        batch = self.replay_buffer.getBatch(self.batch_size)

        # keep track of these inputs to the Q networks for the batch
        states                     = np.zeros((self.batch_size, self.state_dim))
        rewards                    = np.zeros((self.batch_size,))
        action_selects             = np.zeros((self.batch_size, self.num_actions))
        next_states                = np.zeros((self.batch_size, self.state_dim))
        unfinished_states_flags    = np.zeros((self.batch_size,))

        # train on the experiences in this batch
        for k, (s0, a, r, s1, done) in enumerate(batch):
            states[k] = s0
            rewards[k] = r
            action_selects[k][a] = 1
            # check terminal state
            if not done:
                next_states[k] = s1
                unfinished_states_flags[k] = 1

        # perform one update of training
        cost, _ = self.session.run([self.td_loss, self.train_op], {
          self.states : states,
          self.next_states : next_states,
          self.unfinished_states_flags : unfinished_states_flags,
          self.action_selects : action_selects,
          self.rewards : rewards
        })

        # update target network using learned Q-network
        self.session.run(self.target_network_update)

        self.annealExploration()
        self.train_iteration += 1

    # saves the trained model
    def saveModel(self, name):
        self.saver.save(self.session, name)

    def restoreModel(self, name):
        self.saver.restore(self.session, './' + name)

    def reset(self):
        # initialize exploration
        self.exploration = self.init_exp

        # Initialize the replay buffer.
        self.replay_buffer = ReplayBuffer(self.replay_buffer_size)
        self.experience_cnt = 0

        self.train_iteration = 0
        self.session.run(tf.global_variables_initializer())
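
A minimal sketch of a training loop for this learner, assuming a discrete-action Gym-style environment and the q_network(states, state_dim, num_actions) constructor described in the docstring; the helper name and episode/step counts are illustrative:

def train_dqn(env, learner, num_episodes=500, max_steps=200):
    # epsilon-greedy acting, experience storage, and batched Q-learning updates
    for _ in range(num_episodes):
        state = env.reset()
        for _ in range(max_steps):
            action = learner.e_greedy_policy(state.reshape(1, -1))
            next_state, reward, done, _ = env.step(action)
            learner.store_experience(state, action, reward, next_state, done)
            learner.updateModel()           # no-op until the buffer holds at least one batch
            state = next_state
            if done:
                break
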
Example #17
class DDPG:
    """docstring for DDPG"""
    def __init__(self):
        self.name = 'DDPG'  # name for uploading results
        # self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = 12
        self.action_dim = 10
        self.has_kicked = False
        self.laststep_haskicked = False
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)
        self.saver = tf.train.Saver(max_to_keep=1)
        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (Ornstein-Uhlenbeck) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

    def train(self):
        #print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        # print(minibatch)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        # print(q_value_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        with open('/home/ruizhao/Desktop/a.txt', 'a') as f:
            print("action_batch[0]", file=f)
            print(action_batch[0], file=f)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)
        with open('/home/ruizhao/Desktop/a.txt', 'a') as f:
            print("q_gradient_batch[0]", file=f)
            print(q_gradient_batch[0], file=f)
        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action2(self, state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def noise_action(self, state):
        action = self.actor_network.action(state)
        random_action = np.zeros(10, float)
        random_action[random.randint(0, 3)] = 1
        random_action[4] = random.uniform(-100, 100)  #DASH POWER
        random_action[5] = random.uniform(-180, 180)  #DASH DEGREES
        random_action[6] = random.uniform(-180, 180)  #TURN DEGREES
        random_action[7] = random.uniform(-180, 180)  #TACKLE DEGREES
        random_action[8] = random.uniform(0, 100)  #KICK POWER
        random_action[9] = random.uniform(-180, 180)  #KICK DEGREES
        if np.random.uniform() < EPSILON:
            return action
        else:
            return random_action

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Store transitions to replay start size then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        #if self.time_step % 10000 == 0:
        #self.actor_network.save_network(self.time_step)
        #self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
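
The debug dumps in train() above append to a hard-coded Desktop path. A portable alternative (an assumption, not part of the original) is to route the same diagnostics through the standard logging module:

import logging

debug_logger = logging.getLogger("ddpg_debug")

def log_batch_sample(name, batch):
    # log the first row of a batch at DEBUG level instead of appending to a fixed file
    debug_logger.debug("%s[0] = %s", name, batch[0])
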
Example #18
class MaDDPG:
    def __init__(self, num_agents, state_dim, action_dim):
        # track training times
        self.time_step = 0
        # create the session; log device placement to confirm GPU use
        #self.sess = tf.InteractiveSession()
        self.sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=True))
        self.num_agents = num_agents
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.agents = self.create_multi_agents(self.sess, num_agents,
                                               self.state_dim, self.action_dim)
        # create the CriticNetwork after the agents; it summarizes the mean Q value internally
        self.critic = CriticNetwork(self.sess, state_dim, action_dim)
        self.exploration_noise = OUNoise((self.num_agents, action_dim))
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        # for storing checkpoints
        self.saver = tf.train.Saver()

    def train(self):
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.zeros((BATCH_SIZE, self.num_agents, self.state_dim))
        action_batch = np.zeros((BATCH_SIZE, self.num_agents, self.action_dim))
        reward_batch = np.zeros((BATCH_SIZE, self.num_agents))
        next_state_batch = np.zeros(
            (BATCH_SIZE, self.num_agents, self.state_dim))
        done_batch = np.zeros((BATCH_SIZE))
        for ii in range(BATCH_SIZE):
            state_batch[ii, :, :] = minibatch[ii][0]
            action_batch[ii, :, :] = minibatch[ii][1]
            reward_batch[ii, :] = minibatch[ii][2]
            next_state_batch[ii, :, :] = minibatch[ii][3]
            done_batch[ii] = minibatch[ii][4]

        # calculate Gt batch
        next_action_batch = self.target_actions(next_state_batch)
        q_value_batch = self.critic.target_q(next_state_batch,
                                             next_action_batch)
        gt = np.zeros((BATCH_SIZE, self.num_agents))
        for ii in range(BATCH_SIZE):
            if done_batch[ii]:
                gt[ii, :] = reward_batch[ii, :]
            else:
                gt[ii, :] = reward_batch[ii, :] + GAMMA * q_value_batch[ii, :]
        #update critic by minimizing the loss
        self.critic.train(gt, state_batch, action_batch)

        # update policy using the sampling gradients
        actions_for_grad = self.actions(state_batch)
        q_gradients_batch = self.critic.gradients(state_batch,
                                                  actions_for_grad)
        self.train_agents(q_gradients_batch, state_batch)

        # update critic target network
        self.critic.update_target()

        # update actor target
        self.update_agents_target()

    def summary(self, record_num):
        if self.replay_buffer.count() > SUMMARY_BATCH_SIZE:
            mini_batch = self.replay_buffer.popn(SUMMARY_BATCH_SIZE)
            state_batch = np.zeros(
                (SUMMARY_BATCH_SIZE, self.num_agents, self.state_dim))
            for ii in range(SUMMARY_BATCH_SIZE):
                state_batch[ii, :, :] = mini_batch[ii][0]

            actions_for_summary = self.actions(state_batch)
            self.critic.write_summaries(state_batch, actions_for_summary,
                                        record_num)

    def update_agents_target(self):
        for agent in self.agents:
            agent.update_target()

    def train_agents(self, gradients_batch, state_batch):
        # gradients_batch = [batchsize* agents* action_dim]
        # state_batch = [batchsize* agents * state_dim ]
        for ii in range(self.num_agents):
            grad = gradients_batch[:, ii, :]
            state = state_batch[:, ii, :]
            self.agents[ii].train(grad, state)

    def create_multi_agents(self, sess, num_agents, state_dim, action_dim):
        agents = []
        nets = None
        for ii in range(num_agents):
            agent_name = 'agent' + str(ii)
            agents.append(
                ActorNetwork(sess, state_dim, action_dim, agent_name, nets))
            nets = agents[-1].nets
        return agents

    def add_agents(self, add_num):
        for ii in range(add_num):
            #self.num_agents+=1

            agent_name = 'agent' + str(self.num_agents)
            self.agents.append(
                ActorNetwork(self.sess, self.state_dim, self.action_dim,
                             agent_name, self.agents[-1].nets))
            # agent names run from 0 to num_agents - 1
            self.num_agents += 1

        # if add a new agent then reset the noise and replay buffer
        self.exploration_noise = OUNoise((self.num_agents, self.action_dim))
        #self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        self.replay_buffer.erase()
        # re-create the saver: the new saver will contain all savable variables,
        # whereas the old one only contains the initially created agents
        self.saver = tf.train.Saver()
        # reset the time step
        # self.time_step = 0

    def action(self, state):
        # action for a single per-agent state, not a batch of states
        # state = [num_agents * state_dim]
        # actions = [num_agents *  action_dim]
        action = np.zeros((self.num_agents, self.action_dim))
        for ii in range(self.num_agents):
            action[ii, :] = self.agents[ii].action(state[ii, :])
        return action

    def actions(self, state_batch):
        #state = batch_size*numOfagents*state_dim
        #actions = batch_size*numOfagents*action_dim
        batch_size = state_batch.shape[0]
        actions = np.zeros((batch_size, self.num_agents, self.action_dim))
        for ii in range(self.num_agents):
            actions[:, ii, :] = self.agents[ii].actions(state_batch[:, ii, :])
        return actions

    def target_actions(self, state_batch):
        # the state size  is batch_size* num_agents * state_dimension
        actions = np.zeros(
            (state_batch.shape[0], self.num_agents, self.action_dim))
        for ii in range(self.num_agents):
            actions[:,
                    ii, :] = self.agents[ii].target_actions(state_batch[:,
                                                                        ii, :])
        return actions

    def noise_action(self, state):
        action = self.action(state)
        # clip the action, action \in [-1,+1]
        return np.clip(action + self.exploration_noise.noise(), -1, 1)

    def close_session(self):
        self.sess.close()

    def perceive(self, state, action, reward, next_state, done):
        # store {st,at,Rt+1,st+1}
        self.replay_buffer.add(state, action, reward, next_state, done)

        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.time_step += 1
            self.train()
            if self.time_step % SAVE_STEPS == 0:
                self.save_network()
            # if self.time_step % 10000 == 0:
            # self.actor_network.save_network(self.time_step)
            # self.critic_network.save_network(self.time_step)

            # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

    def load_network(self):
        checkpoint = tf.train.get_checkpoint_state("saved_network")
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print("Successfully loaded:", checkpoint.model_checkpoint_path)
        else:
            print('Could not find old network weights')

    def save_network(self):
        # do not run this while the folder is syncing under Dropbox;
        # exit Dropbox, then run
        print('save network...', self.time_step)
        self.saver.save(self.sess,
                        'saved_network/' + 'network',
                        global_step=self.time_step)
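
The per-agent training above slices the batched tensors along the agent axis. A tiny NumPy sketch of the shapes involved (batch size, agent count and dimensions here are illustrative):

import numpy as np

batch_size, num_agents, state_dim, action_dim = 4, 2, 3, 2
state_batch = np.zeros((batch_size, num_agents, state_dim))
grad_batch = np.zeros((batch_size, num_agents, action_dim))

for ii in range(num_agents):
    # each agent's actor sees only its own slice of the batch
    per_agent_states = state_batch[:, ii, :]   # shape (batch_size, state_dim)
    per_agent_grads = grad_batch[:, ii, :]     # shape (batch_size, action_dim)
    print(ii, per_agent_states.shape, per_agent_grads.shape)
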
Example #19
class DDPG:
    """docstring for DDPG"""
    def __init__(self, env):
        self.name = 'DDPG' # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)
        
        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (Ornstein-Uhlenbeck) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

    def train(self):
        #print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch
        
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Store transitions to replay start size then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        #if self.time_step % 10000 == 0:
            #self.actor_network.save_network(self.time_step)
            #self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
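
The y_batch loop above can equivalently be written in vectorized form; a minimal NumPy sketch of the same y = r + gamma * Q'(s', mu'(s')) target with terminal masking (the helper name is illustrative):

import numpy as np

def bellman_targets(rewards, q_next, dones, gamma):
    # zero out the bootstrap term on terminal transitions
    rewards = np.asarray(rewards, dtype=float).reshape(-1, 1)
    q_next = np.asarray(q_next, dtype=float).reshape(-1, 1)
    not_done = 1.0 - np.asarray(dones, dtype=float).reshape(-1, 1)
    return rewards + gamma * q_next * not_done
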
Example #20
def run_ddpg(amodel,
             cmodel,
             train_indicator=0,
             seeded=1337,
             track_name='practgt2.xml'):
    OU = FunctionOU()
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001  # Target Network HyperParameters
    LRA = 0.0001  # Learning rate for Actor
    LRC = 0.001  # Learning rate for Critic
    ALPHA = 0.9

    action_dim = 3  # Steering/Acceleration/Brake
    state_dim = 29  # number of sensor inputs

    np.random.seed(seeded)

    vision = False

    EXPLORE = 100000.
    if train_indicator:
        episode_count = 600
    else:
        episode_count = 3
    max_steps = 20000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision,
                   throttle=True,
                   gear_change=False,
                   track_name=track_name)

    if not train_indicator:
        # Now load the weight
        #logging.info("Now we load the weight")
        print("Now we load the weight")
        try:
            actor.model.load_weights(amodel)
            critic.model.load_weights(cmodel)
            actor.target_model.load_weights(amodel)
            critic.target_model.load_weights(cmodel)
            #logging.info(" Weight load successfully")
            print("Weight load successfully")
        except Exception:
            #logging.info("Cannot find the weight")
            print("Cannot find the weight")
            exit()

    #logging.info("TORCS Experiment Start.")
    print("TORCS Experiment Start.")
    best_lap = 500

    for i_episode in range(episode_count):
        print("Episode : " + str(i_episode) + " Replay Buffer " +
              str(buff.count()))
        #logging.info("Episode : " + str(i_episode) + " Replay Buffer " + str(buff.count()))
        if np.mod(i_episode, 3) == 0:
            ob = env.reset(
                relaunch=True
            )  # relaunch TORCS every 3 episodes because of the memory leak error
        else:
            ob = env.reset()

        s_t = np.hstack((ob.speedX, ob.angle, ob.trackPos, ob.speedY,
                         ob.speedZ, ob.rpm, ob.wheelSpinVel / 100.0, ob.track))

        total_reward = 0.

        for j_iter in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            s_t1 = np.hstack(
                (ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ,
                 ob.rpm, ob.wheelSpinVel / 100.0, ob.track))

            buff.add(s_t, a_t[0], r_t, s_t1, done)  # Add replay buffer

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if train_indicator:
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            print("Episode", i_episode, "Step", step, "Action", a_t, "Reward",
                  r_t, "Loss", loss)

            if np.mod(step, 1000) == 0:
                logging.info("Episode {}, Distance {}, Last Lap {}".format(
                    i_episode, ob.distRaced, ob.lastLapTime))
                if ob.lastLapTime > 0:
                    # keep the fastest (lowest) completed lap time
                    if ob.lastLapTime < best_lap:
                        best_lap = ob.lastLapTime

            step += 1
            if done:
                break

        if train_indicator and i_episode > 20:
            if np.mod(i_episode, 3) == 0:
                logging.info("Now we save model")
                actor.model.save_weights("ddpg_actor_weights_periodic.h5",
                                         overwrite=True)
                critic.model.save_weights("ddpg_critic_weights_periodic.h5",
                                          overwrite=True)

        print("TOTAL REWARD @ " + str(i_episode) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("Best Lap {}".format(best_lap))
        print("")
        logging.info("TOTAL REWARD @ " + str(i_episode) +
                     "-th Episode  : Reward " + str(total_reward))
        logging.info("Best Lap {}".format(best_lap))
    env.end()  # This is for shutting down TORCS
    logging.info("Finish.")
Example #21
class DDPG:
    def __init__(self, env):
        self.name = 'DDPG'
        self.environment = env
        self.episode = 0
        self.epsilon = 0.98
        self.one_number = 1
        self.mean = []
        self.state_dim = len(obs2state(env.reset().observation))
        self.action_dim = env.action_spec().shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        self.exploration_noise = OUNoise(self.action_dim)

    def train(self):

        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        self.critic_network.train(y_batch, state_batch, action_batch)
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        action = self.actor_network.action(state)
        exp = self.exploration_noise.noise()
        t = action * exp
        return exp

    def action(self, state):
        if np.random.rand() <= self.epsilon:
            act = self.noise_action(state)
            z = array(act)
        else:
            action = self.actor_network.action(state)
            z = array(action)
        self.mean.append(z[0])
        g = np.tanh(z)
        return g

    def perceive(self, state, action, reward, next_state, done):
        self.replay_buffer.add(state, action, reward, next_state, done)
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()
        if self.epsilon > 0.1:
            self.epsilon *= 0.99999

        if done:
            self.exploration_noise.reset()
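
With the multiplicative schedule in perceive() above (epsilon *= 0.99999 per step, floored at 0.1 from a start of 0.98), the floor is reached after roughly ln(0.1/0.98)/ln(0.99999), about 228,000 training steps. A quick check:

import math

start_eps, floor_eps, decay = 0.98, 0.1, 0.99999
steps_to_floor = math.log(floor_eps / start_eps) / math.log(decay)
print(int(steps_to_floor))  # about 228,000 training steps
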
Example #22
class DDPG:
    """docstring for DDPG"""
    def __init__(self, env):
        mx.random.seed(seed)
        np.random.seed(seed)
        self.env = env
        if flg_gpu:
            self.ctx = mx.gpu(0)
        else:
            self.ctx = mx.cpu()
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.ddpgnet = DDPGNet(self.state_dim, self.action_dim)
        self.exploration_noise = OUNoise(self.action_dim)
        self.replay_buffer = ReplayBuffer(memory_size)

        self.batch_size = batch_size

        self.ddpgnet.init()
        self.train_step = 0

    def train(self):
        # print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(self.batch_size)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch,
                                 [self.batch_size, self.action_dim])

        # Calculate y_batch
        next_qvals = self.ddpgnet.get_target_q(next_state_batch).asnumpy()

        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * next_qvals[i][0])
        y_batch = np.resize(y_batch, [self.batch_size, 1])

        # Update critic by minimizing the loss L
        self.ddpgnet.update_critic(state_batch, action_batch, y_batch)

        # Update actor by maximizing Q
        self.ddpgnet.update_actor(state_batch)

        self.train_step += 1
        # update target networks
        self.ddpgnet.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        state = np.reshape(state, (1, self.state_dim))
        action = self.ddpgnet.get_step_action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        state = np.reshape(state, (1, self.state_dim))
        action = self.ddpgnet.get_step_action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Store transitions to replay start size then start training
        if self.replay_buffer.count() > memory_start_size:
            self.train()

            # if self.time_step % 10000 == 0:
            # self.actor_network.save_network(self.time_step)
            # self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
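
DDPGNet.update_target is not shown here; DDPG target networks are conventionally refreshed by Polyak averaging, theta_target <- tau * theta + (1 - tau) * theta_target. A framework-agnostic NumPy sketch of that rule (the tau value is only a typical choice):

import numpy as np

def soft_update(target_params, source_params, tau=0.001):
    # blend each target parameter toward its online counterpart
    return [(1.0 - tau) * t + tau * s
            for t, s in zip(target_params, source_params)]
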
Example #23
class DDPG(object):
    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        self.epsilon_expert_range = (1.0, 0.1)
        self.epsilon_expert = self.epsilon_expert_range[0]
        self.epsilon_random_range = (0.1, 0.01)
        self.epsilon_random = self.epsilon_random_range[0]
        # Randomly initialize actor network and critic network
        # with both their target networks
        # self.state_dim = env.observation_space.shape[0]
        self.state_dim = 16
        # self.action_dim = env.action_space.shape[0]
        self.action_dim = 3
        self.time_step = 0
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (Ornstein-Uhlenbeck) for action exploration
        # self.exploration_noise = OUNoise(self.action_dim)
        # self.exploration_noise = OUNoise()
        self.OU = OU()
        # loading networks
        self.saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state(MODEL_PATH)
        if checkpoint and checkpoint.model_checkpoint_path:
            path = checkpoint.model_checkpoint_path
            self.saver.restore(self.sess, path)
            self.time_step = int(path[path.rindex('-') + 1:])
            self.epsilon_expert -= (
                self.epsilon_expert_range[0] -
                self.epsilon_expert_range[1]) * self.time_step / EXPLORE_COUNT
            self.epsilon_expert = max(self.epsilon_expert,
                                      self.epsilon_expert_range[1])
            self.epsilon_random -= (
                self.epsilon_random_range[0] -
                self.epsilon_random_range[1]) * self.time_step / EXPLORE_COUNT
            self.epsilon_random = max(self.epsilon_random,
                                      self.epsilon_random_range[1])
            logger.warn(
                "Successfully loaded: %s, step: %d, epsilon_expert: %s, epsilon_random: %s"
                % (path, self.time_step, self.epsilon_expert,
                   self.epsilon_random))
        else:
            logger.warn("Could not find old network weights")

        self.critic_cost = 0

    def train(self):
        self.time_step = self.time_step + 1
        self.epsilon_expert -= (self.epsilon_expert_range[0] -
                                self.epsilon_expert_range[1]) / EXPLORE_COUNT
        self.epsilon_expert = max(self.epsilon_expert,
                                  self.epsilon_expert_range[1])
        self.epsilon_random -= (self.epsilon_random_range[0] -
                                self.epsilon_random_range[1]) / EXPLORE_COUNT
        self.epsilon_random = max(self.epsilon_random,
                                  self.epsilon_random_range[1])
        logger.debug(
            "step: %d, epsilon_expert: %s, epsilon_random: %s" %
            (self.time_step, self.epsilon_expert, self.epsilon_random))
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
            # if done_batch[i]:
            #     y_batch.append(reward_batch[i])
            # else :
            #     y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_cost = self.critic_network.train(y_batch, state_batch,
                                                     action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    # def noise_action(self,state):
    #     # Select action a_t according to the current policy and exploration noise
    #     action = self.actor_network.action(state)
    #     noise = self.exploration_noise.noise(action)
    #     noise_action = action + noise
    #     clipped_noise_action = np.clip(noise_action, 0, 1)
    #     return clipped_noise_action

    # def noise_action(self,state):
    #     # Select action a_t according to the current policy and exploration noise
    #     action = self.actor_network.action(state)
    #     noise = np.zeros(self.action_dim)
    #     noise[0] = self.epsilon * self.OU.function(action[0], 0.5, 1.00, 0.10)
    #     noise[1] = self.epsilon * self.OU.function(action[1], 0.5, 1.00, 0.10)
    #     noise[2] = self.epsilon * self.OU.function(action[2], 0.5, 1.00, 0.10)
    #     noise_action = action + noise
    #     logger.debug("action: %s, noise: %s" % (action, noise))
    #     clipped_noise_action = np.clip(noise_action, 0, 1)
    #     return clipped_noise_action

    def action(self, state):
        action = self.actor_network.action(state)
        logger.debug("action: %s" % (action))
        return action

    def opposite_action(self, state):
        logger.debug("state: %s" % (state))
        action = self.actor_network.action(state)
        logger.debug("action: %s" % (action))
        action[0] = 1 - action[0]
        logger.debug("opposite action: %s" % (action))
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # self.time_step = self.time_step + 1

        # Store transitions to replay start size then start training
        if self.replay_buffer.count() >= REPLAY_START_SIZE:
            # logger.debug("train...")
            self.train()

        #if self.time_step % 10000 == 0:
        #self.actor_network.save_network(self.time_step)
        #self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        # if done:
        #     self.exploration_noise.reset()

    def saveNetwork(self):
        logger.warn("time step: %s, save model" % (self.time_step))
        ckpt_file = os.path.join(MODEL_PATH, 'DDPG')
        self.saver.save(self.sess, ckpt_file, global_step=self.time_step)
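
Both epsilons above decay linearly over EXPLORE_COUNT training steps and are re-derived from the restored checkpoint's step counter. A compact helper expressing that schedule (a sketch, not the author's code):

def annealed_epsilon(eps_range, time_step, explore_count):
    # linear interpolation from eps_range[0] down to eps_range[1]
    start, end = eps_range
    return max(start - (start - end) * time_step / float(explore_count), end)

# e.g. annealed_epsilon((1.0, 0.1), time_step=50000, explore_count=100000) -> 0.55
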
Example #24
class DeepDeterministicPolicyGradient(object):

  def __init__(self, session,
                     optimizer,
                     actor_network,
                     critic_network,
                     state_dim,
                     action_dim,
                     batch_size=32,
                     replay_buffer_size=1000000, # size of replay buffer
                     store_replay_every=1,       # how frequent to store experience
                     discount_factor=0.99,       # discount future rewards
                     target_update_rate=0.01,
                     reg_param=0.01,             # regularization constants
                     max_gradient=5,             # max gradient norms
                     noise_sigma=0.20,
                     noise_theta=0.15,
                     summary_writer=None,
                     summary_every=100):

    # tensorflow machinery
    self.session        = session
    self.optimizer      = optimizer
    self.summary_writer = summary_writer

    # model components
    self.actor_network  = actor_network
    self.critic_network = critic_network
    self.replay_buffer  = ReplayBuffer(buffer_size=replay_buffer_size)

    # training parameters
    self.batch_size         = batch_size
    self.state_dim          = state_dim
    self.action_dim         = action_dim
    self.discount_factor    = discount_factor
    self.target_update_rate = target_update_rate
    self.max_gradient       = max_gradient
    self.reg_param          = reg_param

    # Ornstein-Uhlenbeck noise for exploration
    self.noise_var = tf.Variable(tf.zeros([1, action_dim]))
    noise_random = tf.random_normal([1, action_dim], stddev=noise_sigma)
    self.noise = self.noise_var.assign_sub((noise_theta) * self.noise_var - noise_random)

    # counters
    self.store_replay_every   = store_replay_every
    self.store_experience_cnt = 0
    self.train_iteration      = 0

    # create and initialize variables
    self.create_variables()
    var_lists = tf.get_collection(tf.GraphKeys.VARIABLES)
    self.session.run(tf.initialize_variables(var_lists))

    # make sure all variables are initialized
    self.session.run(tf.assert_variables_initialized())

    if self.summary_writer is not None:
      # graph was not available when journalist was created
      self.summary_writer.add_graph(self.session.graph)
      self.summary_every = summary_every

  def create_variables(self):
    
    with tf.name_scope("model_inputs"):
      # raw state representation
      self.states = tf.placeholder(tf.float32, (None, self.state_dim), name="states")
      # action input used by critic network
      self.action = tf.placeholder(tf.float32, (None, self.action_dim), name="action")

    # define outputs from the actor and the critic
    with tf.name_scope("predict_actions"):
      # initialize actor-critic network
      with tf.variable_scope("actor_network"):
        self.policy_outputs = self.actor_network(self.states)
      with tf.variable_scope("critic_network"):
        self.value_outputs    = self.critic_network(self.states, self.action)
        self.action_gradients = tf.gradients(self.value_outputs, self.action)[0]

      # predict actions from policy network
      self.predicted_actions = tf.identity(self.policy_outputs, name="predicted_actions")
      tf.histogram_summary("predicted_actions", self.predicted_actions)
      tf.histogram_summary("action_scores", self.value_outputs)

    # get variable list
    actor_network_variables  = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="actor_network")
    critic_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="critic_network")

    # estimate future rewards using the target networks: r + gamma * Q'(s_{t+1}, u'(s_{t+1}))
    with tf.name_scope("estimate_future_rewards"):
      self.next_states = tf.placeholder(tf.float32, (None, self.state_dim), name="next_states")
      self.next_state_mask = tf.placeholder(tf.float32, (None,), name="next_state_masks")
      self.rewards = tf.placeholder(tf.float32, (None,), name="rewards")

      # initialize target network
      with tf.variable_scope("target_actor_network"):
        self.target_actor_outputs = self.actor_network(self.next_states)
      with tf.variable_scope("target_critic_network"):
        self.target_critic_outputs = self.critic_network(self.next_states, self.target_actor_outputs)

      # compute future rewards
      self.next_action_scores = tf.stop_gradient(self.target_critic_outputs)[:,0] * self.next_state_mask
      tf.histogram_summary("next_action_scores", self.next_action_scores)
      self.future_rewards = self.rewards + self.discount_factor * self.next_action_scores

    # compute loss and gradients
    with tf.name_scope("compute_pg_gradients"):

      # compute gradients for critic network
      self.temp_diff        = self.value_outputs[:,0] - self.future_rewards
      self.mean_square_loss = tf.reduce_mean(tf.square(self.temp_diff))
      self.critic_reg_loss  = tf.reduce_sum([tf.reduce_sum(tf.square(x)) for x in critic_network_variables])
      self.critic_loss      = self.mean_square_loss + self.reg_param * self.critic_reg_loss
      self.critic_gradients = self.optimizer.compute_gradients(self.critic_loss, critic_network_variables)

      # compute actor gradients (we don't do weight decay for actor network)
      self.q_action_grad = tf.placeholder(tf.float32, (None, self.action_dim), name="q_action_grad")
      actor_policy_gradients = tf.gradients(self.policy_outputs, actor_network_variables, -self.q_action_grad)
      self.actor_gradients = zip(actor_policy_gradients, actor_network_variables)

      # collect all gradients
      self.gradients = self.actor_gradients + self.critic_gradients

      # clip gradients
      for i, (grad, var) in enumerate(self.gradients):
        # clip gradients by norm
        if grad is not None:
          self.gradients[i] = (tf.clip_by_norm(grad, self.max_gradient), var)

      # summarize gradients
      for grad, var in self.gradients:
        tf.histogram_summary(var.name, var)
        if grad is not None:
          tf.histogram_summary(var.name + '/gradients', grad)

      # emit summaries
      tf.scalar_summary("critic_loss", self.critic_loss)
      tf.scalar_summary("critic_td_loss", self.mean_square_loss)
      tf.scalar_summary("critic_reg_loss", self.critic_reg_loss)

      # apply gradients to update actor network
      self.train_op = self.optimizer.apply_gradients(self.gradients)

    # slowly update the target networks toward the online actor and critic networks
    with tf.name_scope("update_target_network"):
      self.target_network_update = []

      # slowly update target network parameters with the actor network parameters
      actor_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="actor_network")
      target_actor_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="target_actor_network")
      for v_source, v_target in zip(actor_network_variables, target_actor_network_variables):
        # this is equivalent to target = (1-alpha) * target + alpha * source
        update_op = v_target.assign_sub(self.target_update_rate * (v_target - v_source))
        self.target_network_update.append(update_op)

      # same for the critic network
      critic_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="critic_network")
      target_critic_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="target_critic_network")
      for v_source, v_target in zip(critic_network_variables, target_critic_network_variables):
        # this is equivalent to target = (1-alpha) * target + alpha * source
        update_op = v_target.assign_sub(self.target_update_rate * (v_target - v_source))
        self.target_network_update.append(update_op)

      # group all assignment operations together
      self.target_network_update = tf.group(*self.target_network_update)

    self.summarize = tf.merge_all_summaries()
    self.no_op = tf.no_op()

  def sampleAction(self, states, exploration=True):
    policy_outs, ou_noise = self.session.run([
      self.policy_outputs,
      self.noise
    ], {
      self.states: states
    })
    # add OU noise for exploration
    policy_outs = policy_outs + ou_noise if exploration else policy_outs
    return policy_outs

  def updateModel(self):

    # not enough experiences yet
    if self.replay_buffer.count() < self.batch_size:
      return

    batch           = self.replay_buffer.getBatch(self.batch_size)
    states          = np.zeros((self.batch_size, self.state_dim))
    rewards         = np.zeros((self.batch_size,))
    actions         = np.zeros((self.batch_size, self.action_dim))
    next_states     = np.zeros((self.batch_size, self.state_dim))
    next_state_mask = np.zeros((self.batch_size,))

    for k, (s0, a, r, s1, done) in enumerate(batch):
      states[k]  = s0
      rewards[k] = r
      actions[k] = a
      if not done:
        next_states[k] = s1
        next_state_mask[k] = 1

    # whether to calculate summaries
    # guard on summary_writer first: summary_every is only set when a writer is given
    calculate_summaries = (self.summary_writer is not None and
                           self.train_iteration % self.summary_every == 0)

    # compute a = u(s)
    policy_outs = self.session.run(self.policy_outputs, {
      self.states: states
    })

    # compute d_a Q(s,a) where s=s_i, a=u(s)
    action_grads = self.session.run(self.action_gradients, {
      self.states: states,
      self.action: policy_outs
    })

    critic_loss, _, summary_str = self.session.run([
      self.critic_loss,
      self.train_op,
      self.summarize if calculate_summaries else self.no_op
    ], {
      self.states:          states,
      self.next_states:     next_states,
      self.next_state_mask: next_state_mask,
      self.action:          actions,
      self.rewards:         rewards,
      self.q_action_grad:   action_grads
    })

    # update target network using Q-network
    self.session.run(self.target_network_update)

    # emit summaries
    if calculate_summaries:
      self.summary_writer.add_summary(summary_str, self.train_iteration)

    self.train_iteration += 1

  def storeExperience(self, state, action, reward, next_state, done):
    # always store end states
    if self.store_experience_cnt % self.store_replay_every == 0 or done:
      self.replay_buffer.add(state, action, reward, next_state, done)
    self.store_experience_cnt += 1
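
# --- Usage sketch (added; not part of the original example) ----------------
# A minimal, hypothetical training loop for the DeepDeterministicPolicyGradient
# agent above. `agent` is an instance of that class and `env` is assumed to be
# a Gym-style environment with a continuous action space; `np` is NumPy.
def run_ddpg_episode(agent, env):
  state = env.reset()
  total_reward, done = 0.0, False
  while not done:
    # sampleAction expects a batch of states, hence the reshape to (1, state_dim)
    action = agent.sampleAction(np.asarray(state).reshape(1, -1))[0]
    next_state, reward, done, _ = env.step(action)
    agent.storeExperience(state, action, reward, next_state, done)
    agent.updateModel()  # one optimization step per environment interaction
    state, total_reward = next_state, total_reward + reward
  return total_reward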
class Worker:
    """docstring for DDPG"""
    def __init__(self, sess, number, model_path, global_episodes, explore,
                 decay, training):
        self.name = 'worker_' + str(number)  # name for uploading results
        self.number = number
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = 41
        self.action_dim = 18
        self.model_path = model_path
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.sess = sess
        self.explore = explore
        self.decay = decay
        self.training = training

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim,
                                          self.name + '/actor')
        self.actor_network.update_target(self.sess)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim,
                                            self.name + '/critic')
        self.critic_network.update_target(self.sess)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        self.update_local_ops_actor = update_target_graph(
            'global/actor', self.name + '/actor')
        self.update_local_ops_critic = update_target_graph(
            'global/critic', self.name + '/critic')

    def start(self, setting=0):
        self.env = RunEnv(visualize=True)
        self.setting = setting

    def train(self):
        #print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch

        next_action_batch = self.actor_network.target_actions(
            self.sess, next_state_batch)
        q_value_batch = self.critic_network.target_q(self.sess,
                                                     next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(
            self.sess, state_batch)
        q_gradient_batch = self.critic_network.gradients(
            self.sess, state_batch, action_batch_for_gradients)

        self.actor_network.train(self.sess, q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target(self.sess)
        self.critic_network.update_target(self.sess)

    def save_model(self, saver, episode):
        #if self.episode % 10 == 1:
        if self.name == 'worker_0':
            saver.save(self.sess,
                       self.model_path + "/model-" + str(episode) + ".ckpt")

    def noise_action(self, state, decay):
        # Select action a_t according to the current policy and exploration noise which gradually vanishes
        action = self.actor_network.action(self.sess, state)
        return action + self.exploration_noise.noise() * decay

    def action(self, state):
        action = self.actor_network.action(self.sess, state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Store transitions to replay start size then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE and self.training:
            self.train()

        #if self.time_step % 10000 == 0:
        #self.actor_network.save_network(self.time_step)
        #self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

    def work(self, coord, saver):
        if self.training:
            episode_count = self.sess.run(self.global_episodes)
        else:
            episode_count = 0
        wining_episode_count = 0
        total_steps = 0
        print("Starting worker_" + str(self.number))

        with self.sess.as_default(), self.sess.graph.as_default():
            #not_start_training_yet = True
            while not coord.should_stop():
                returns = []
                rewards = []
                episode_reward = 0

                if np.random.rand() < 0.9:  # change Aug20: stochastically apply noise
                    noisy = True
                    self.decay -= 1. / self.explore
                else:
                    noisy = False

                self.sess.run(self.update_local_ops_actor)
                self.sess.run(self.update_local_ops_critic)

                state = self.env.reset(difficulty=self.setting)
                #print(observation)
                s = process_frame(state)

                print "episode:", episode_count
                # Train

                for step in xrange(self.env.spec.timestep_limit):
                    state = process_frame(state)
                    if noisy:
                        action = np.clip(
                            self.noise_action(state, np.maximum(self.decay,
                                                                0)), 0.0, 1.0
                        )  # change Aug20, decay noise (no noise after ep>=self.explore)
                    else:
                        action = self.action(state)
                    next_state, reward, done, _ = self.env.step(action)
                    #print('state={}, action={}, reward={}, next_state={}, done={}'.format(state, action, reward, next_state, done))
                    next_state = process_frame(next_state)
                    self.perceive(state, action, reward * 100, next_state,
                                  done)
                    state = next_state
                    episode_reward += reward
                    if done:
                        break

                if episode_count % 5 == 0:
                    print "episode reward:", episode_reward

                # Testing:
                #if episode % 1 == 0:
                if self.name == 'worker_0' and episode_count % 50 == 0 and episode_count > 1:  # change Aug19
                    self.save_model(saver, episode_count)
                    total_return = 0
                    ave_reward = 0
                    for i in xrange(TEST):
                        state = self.env.reset()
                        reward_per_step = 0
                        for j in xrange(self.env.spec.timestep_limit):
                            action = self.action(
                                process_frame(state))  # direct action for test
                            state, reward, done, _ = self.env.step(action)
                            total_return += reward
                            # running average of the per-step reward
                            reward_per_step += (reward -
                                                reward_per_step) / (j + 1)
                            if done:
                                break
                        ave_reward += reward_per_step

                    ave_return = total_return / TEST
                    ave_reward = ave_reward / TEST
                    returns.append(ave_return)
                    rewards.append(ave_reward)

                    print 'episode:', episode_count, 'Evaluation Average Return:', ave_return, ' Evaluation Average Reward:', ave_reward

                if self.name == 'worker_0' and self.training:
                    self.sess.run(self.increment)
                episode_count += 1

            # All done; stop the trial and confirm exit
            print('Done ' + self.name)
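
# --- Sketch (added; not part of the original example) ----------------------
# update_target_graph() is defined elsewhere in this example; a common
# implementation, assumed here, copies every trainable variable from the
# global scope into the worker's local scope (hard sync, no Polyak averaging).
def update_target_graph_sketch(from_scope, to_scope):
    from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
    to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)
    # one assign op per variable pair; run them together to sync the worker
    return [to_var.assign(from_var)
            for from_var, to_var in zip(from_vars, to_vars)]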
Example #26
class DDPG:
    """docstring for DDPG"""
    def __init__(self, env, results_file):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        results_file.write(ActorNetwork.get_settings())

    def train(self):
        #print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Store transitions to replay start size then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        #if self.time_step % 10000 == 0:
        #self.actor_network.save_network(self.time_step)
        #self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
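
# --- Sketch (added; not part of the original example) ----------------------
# OUNoise is imported elsewhere in this example; a typical implementation,
# assumed here, is the discretized Ornstein-Uhlenbeck process
# x_{t+1} = x_t + theta * (mu - x_t) + sigma * N(0, 1).
class OUNoiseSketch(object):
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu, self.theta, self.sigma = mu, theta, sigma
        self.reset()

    def reset(self):
        # restart the process at its mean when an episode ends
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return self.state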
Example #27
class DDPG:
    """docstring for DDPG"""


    def __init__(self, a_dim, s_dim):
        self.name = 'DDPG'  # name for uploading results
        # self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = s_dim
        self.action_dim = a_dim
        self.time_step = 0
        self.max_bw = 0.0
        self.max_cwnd = 0.0
        self.min_rtt = 9999999.0

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

    def learn(self):
        # print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        self.time_step += 1
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        noise = self.exploration_noise.noise()
        # print("noise:" + str(noise))
        return action + noise

    def choose_action(self, state):
        self.time_step += 1
        # print("_______________________choose_action_____________________")
        action = self.actor_network.action(state)
        return action

    def store_transition(self, s, a, r, s_, done, episode_count):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        # print("*********************************ADD****************************")
        self.replay_buffer.add(s, a, r, s_, done)

        # Store transitions to replay start size then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            # save the networks every 100th episode, otherwise take a training step
            if (episode_count + 1) % 100 != 0:
                self.learn()
                # print("learn!")
            else:
                self.actor_network.save_network(self.time_step)
                self.critic_network.save_network(self.time_step)


        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

    def extract_observation(self, dataRecorder, subflow_index, state_before):
        # print("extracting...")
        value_dic = dataRecorder.get_latest_data()
        state_after = state_before.reshape(10, 5)
        # observation = np.zeros((4))
        observation = np.zeros((5))
        t_cWnd = [0, 0]
        t_thr = [0, 0]
        t_rtt = [0, 0]
        t_loss_rate = [0, 0]
        t_unAck = [0, 0]
        s0 = [0, 0, 0, 0, 0]
        state = np.zeros(1)
        for i in range(value_dic["nbOfSubflows"]):
            name = "cWnd" + str(i)
            t_cWnd[i] = value_dic[name]
            name = "rtt" + str(i)
            t_rtt[i] = value_dic[name]
            name = "unAck" + str(i)
            t_unAck[i] = value_dic[name]
            name = "loss_rate" + str(i)
            t_loss_rate[i] = value_dic[name]
            name = "throughput" + str(i)
            t_thr[i] = value_dic[name]

        thr = t_thr[subflow_index]
        s0[0] = t_thr[subflow_index]

        rtt = t_rtt[subflow_index]
        s0[1] = t_rtt[subflow_index]

        cwnd = t_cWnd[subflow_index]
        s0[2] = t_cWnd[subflow_index]

        loss_rate = t_loss_rate[subflow_index]
        s0[3] = t_loss_rate[subflow_index]

        unAck = t_unAck[subflow_index]
        s0[4] = t_unAck[subflow_index]

        s0 = np.array(s0)
        min_ = s0 - s0  # zero baseline, so each *_min below equals the raw feature

        thr_n = s0[0]
        thr_n_min = s0[0] - min_[0]
        rtt_min = s0[1] - min_[1]
        cwnd_n_min = s0[2] - min_[2]
        loss_rate_n_min = s0[3] - min_[3]
        unAck_n_min = s0[4] - min_[4]

        # loss_rate_n_min=s0[7]-min_[7]

        if self.max_bw < thr_n_min:
            self.max_bw = thr_n_min
        if self.max_cwnd < cwnd_n_min:
            self.max_cwnd = cwnd_n_min
        if self.min_rtt > rtt_min:
            self.min_rtt = rtt_min

        
        reward = thr_n_min - 5 * (rtt_min - self.min_rtt) - 10 * loss_rate_n_min
        print("reward:" + str(reward) + " thr_n_min:" + str(thr_n_min) +
              " rtt_min:" + str(rtt_min) + " self.min_rtt:" + str(self.min_rtt) +
              " delta_rtt:" + str(rtt_min - self.min_rtt))
        # print("unAck:"+str(unAck_n_min))
        if self.max_bw != 0:
            state[0] = thr_n_min / self.max_bw
            # tmp = pacing_rate_n_min / self.max_bw
            state = np.append(state, [5 * loss_rate_n_min])
            state = np.append(state, [unAck_n_min])
        else:
            state[0] = 0
            state = np.append(state, [0])
            state = np.append(state, [0])
        state = np.append(state, [1400 / cwnd])
        state = np.append(state, [self.min_rtt / rtt_min])

        state_after = np.delete(state_after, [0], axis=0)
        state_after = np.append(state_after, state)

        return state_after, reward, thr_n_min, rtt_min
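
# --- Usage sketch (added; not part of the original example) ----------------
# A hypothetical control step tying extract_observation() to the agent API
# above; dataRecorder, subflow_index, prev_state, prev_action and
# episode_count are assumed to come from the surrounding congestion-control
# loop, and the transition is stored with done=False for an ongoing flow.
def control_step(agent, dataRecorder, subflow_index, prev_state, prev_action,
                 episode_count):
    state, reward, thr, rtt = agent.extract_observation(
        dataRecorder, subflow_index, prev_state)
    agent.store_transition(prev_state, prev_action, reward, state,
                           False, episode_count)
    action = agent.noise_action(state)  # exploratory action for the next step
    return state, action, reward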