Example #1
class ThreadReplay(Thread):
    def __init__(self, server):
        super(ThreadReplay, self).__init__()
        self.daemon = True

        self.server = server
        self.exit_flag = False

        self.replay_buffer = ReplayBuffer(buffer_size=Config.REPLAY_BUFFER_SIZE,
                                          random_seed=Config.REPLAY_BUFFER_RANDOM_SEED)

    def update_stats(self):
        self.server.stats.replay_memory_size.value = self.replay_buffer.size()

    def run(self):
        #print("thread started: " + str(self.id))
        while not self.exit_flag:
            # if queue is near empty put a batch there
            if self.server.replay_q.qsize() < Config.REPLAY_MIN_QUEUE_SIZE:
                if self.replay_buffer.size() > Config.TRAINING_MIN_BATCH_SIZE:
                    x__, r__, a__, x2__, done__ = \
                        self.replay_buffer.sample_batch(Config.TRAINING_MIN_BATCH_SIZE)
                    self.server.replay_q.put((x__, r__, a__, x2__, done__))
            x_, r_, a_, x2_, done_ = self.server.training_q.get()
            # replay memory uses experiences individually
            for i in range(x_.shape[0]):
                self.replay_buffer.add(x_[i], a_[i], r_[i], done_[i], x2_[i])
            self.update_stats()
        # cleaning
        self.replay_buffer.clear()
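
All of the examples in this section lean on a small ReplayBuffer interface: add() to store a transition, size(), sample_batch() to draw a minibatch, and clear() to empty the buffer. The class itself is not reproduced here, and the argument and return ordering differs between projects (Example #1 samples (x, r, a, x2, done), Example #2 samples (s, a, r, t, s2), while Examples #3 and #5 use a dict-based insert()/sample() variant that is not sketched here). The following is therefore only a hypothetical minimal sketch of the tuple-returning interface, not the buffer these projects actually ship.

import random
from collections import deque


class ReplayBuffer(object):
    """Hypothetical minimal buffer matching the add/size/sample_batch/clear calls."""

    def __init__(self, buffer_size, random_seed=123):
        self.buffer = deque(maxlen=buffer_size)
        self.rng = random.Random(random_seed)

    def add(self, s, a, r, t, s2):
        # store a single transition tuple
        self.buffer.append((s, a, r, t, s2))

    def size(self):
        return len(self.buffer)

    def sample_batch(self, batch_size):
        # sample without replacement, capped at the current buffer size
        batch = self.rng.sample(list(self.buffer), min(batch_size, len(self.buffer)))
        s, a, r, t, s2 = map(list, zip(*batch))
        return s, a, r, t, s2

    def clear(self):
        self.buffer.clear()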
Example #2
def train(sess, env, args, actor, critic):
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(
        args['summary_dir'] + " actor_lr" + str(args['actor_lr']) +
        " critic_lr" + str(args["critic_lr"]), sess.graph)

    actor.update_target_network()
    critic.update_target_network()

    replay_buffer = ReplayBuffer(int(args['buffer_size']), int(args['random_seed']))

    for i in range(int(args['max_episodes'])):
        state = env.reset()

        ep_reward = 0
        ep_ave_max_q = 0

        for j in range(int(args['max_episode_len'])):
            action = actor.predict([state])[0]

            state2, reward, done, info = env.step(action)
            reward = np.sum(reward) / NUM_AGENTS

            replay_buffer.add(state, action, reward, done, state2)

            if replay_buffer.size() > int(args['minibatch_size']):
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(int(args['minibatch_size']))

                # TODO
                # Calculate targets
                # target_q = critic.predict_target(
                #     s2_batch, actor.predict_target(s2_batch))

                target_q = tf.zeros((1))

                # Update the critic given the targets
                predicted_q_value, _, loss = critic.train(s_batch, a_batch,
                                                          np.reshape(r_batch, (int(args['minibatch_size']), 1)))

                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                actor.update_target_network()
                critic.update_target_network()

                replay_buffer.clear()

                # Log
                summary_str = sess.run(summary_ops, feed_dict={
                    summary_vars[0]: np.mean(r_batch),
                    summary_vars[1]: ep_ave_max_q / float(j + 1),
                    summary_vars[2]: loss
                })

                writer.add_summary(summary_str, i)
                writer.flush()

                print('| Reward: {:.4f} | Episode: {:d} | Qmax: {:.4f}'.format(np.mean(r_batch),
                                                                               i, (ep_ave_max_q / float(j + 1))))

            state = state2
            ep_reward += reward

            if done:
                break
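
The TODO above leaves the critic targets unfinished: target_q is a dummy tf.zeros tensor and critic.train is called without a Bellman target. The snippet below is a hedged sketch of how the commented-out lines would typically be completed for DDPG, dropped in place of the placeholder. It assumes critic.train accepts the targets as its third argument (as the commented code suggests) and that a discount factor is available as args['gamma']; both are assumptions, not part of the original example.

                # Hedged completion of the TODO (assumes args['gamma'] exists and
                # critic.train takes the Bellman targets as its third argument)
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(int(args['minibatch_size'])):
                    if t_batch[k]:
                        y_i.append(r_batch[k])  # terminal state: no bootstrap
                    else:
                        y_i.append(r_batch[k] + float(args['gamma']) * target_q[k])

                predicted_q_value, _, loss = critic.train(
                    s_batch, a_batch,
                    np.reshape(y_i, (int(args['minibatch_size']), 1)))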
Example #3
class DDPG(object):
    def __init__(self,
                 state_dim,
                 action_dim,
                 action_bounds,
                 gamma=0.99,
                 sess=None):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = gamma

        self.action_mean = (action_bounds[0] + action_bounds[1]) * 0.5
        self.action_scale = (action_bounds[1] - action_bounds[0]) * 0.5

        self.batch_size = 5

        self.replay_buffer = ReplayBuffer(1000000,
                                          state_dim=state_dim,
                                          action_dim=action_dim)

        if sess is None:
            self.sess = tf.InteractiveSession()
        else:
            self.sess = sess

        self.actor = ActorModel(state_dim, action_dim, self.action_mean,
                                self.action_scale, self.sess)
        self.critic = CriticModel(state_dim, action_dim, self.sess)

        self.reset_policy()

        writer = tf.summary.FileWriter('logs', self.sess.graph)
        writer.close()

    def reset_policy(self):
        tf.global_variables_initializer().run()

        self.actor.reset_target_model()
        self.critic.reset_target_model()

        self.train_idx = 0
        self.replay_buffer.clear()

    def curr_policy(self):
        return self.actor.get_action

    def save_model(self, filename='/tmp/model.ckpt'):
        saver = tf.train.Saver()
        save_path = saver.save(self.sess, filename)
        print("Model saved in file: %s" % filename)

    def load_model(self, filename='/tmp/model.ckpt'):
        saver = tf.train.Saver()
        saver.restore(self.sess, filename)
        print("Model loaded from file: %s" % filename)

    def update(self, env, get_state, max_iter=1000):
        state = env.reset()

        total_reward = 0
        rand_process = OrnsteinUhlenbeckProcess(dt=1.0,
                                                theta=0.15,
                                                sigma=0.2,
                                                mu=np.zeros(self.action_dim),
                                                x0=np.zeros(self.action_dim))
        for i in range(max_iter):
            # get action
            action = self.actor.get_action(state)
            # generate random noise for action
            action_noise = rand_process.get_next()
            action += action_noise
            action = np.clip(action, self.action_mean - self.action_scale,
                             self.action_mean + self.action_scale)
            # action = np.array([action.squeeze()])

            [new_state, reward, done, _] = env.step(action)
            new_state = np.reshape(new_state, (1, self.state_dim))
            self.replay_buffer.insert(state, action, reward, new_state, done)

            total_reward += reward
            state = new_state

            if self.train_idx >= (self.batch_size * 3):
                sample = self.replay_buffer.sample(self.batch_size)

                # get target actions
                target_actions = self.actor.get_target_action(
                    sample['next_state'])
                target_q_vals = self.critic.get_target_q_val(
                    sample['next_state'], target_actions)

                disc_return = sample['reward'] + \
                    self.gamma * target_q_vals.squeeze() * (1.0 - sample['terminal'])

                # update critic network
                loss = self.critic.train(sample['state'], sample['action'],
                                         disc_return)

                # get actions grads from critic network
                action_grads = self.critic.get_action_grads(
                    sample['state'], sample['action'])[0]

                # update actor network
                self.actor.train(sample['state'], action_grads)

                # # update target networks
                self.actor.update_target_model()
                self.critic.update_target_model()

            if done:
                break

            self.train_idx += 1

        return total_reward
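
For context, update() runs a single episode and returns its total reward, so training amounts to calling it repeatedly. The driver below is only a hypothetical usage sketch: the environment name and episode counts are placeholders, it assumes an old-style gym environment whose step() returns a 4-tuple (as update() expects), and it passes None for the unused get_state argument.

# Hypothetical driver loop for the DDPG class above.
import gym

env = gym.make('Pendulum-v0')  # placeholder; any env with a 4-tuple step() works
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
action_bounds = (env.action_space.low, env.action_space.high)

agent = DDPG(state_dim, action_dim, action_bounds)
for episode in range(200):
    total_reward = agent.update(env, get_state=None, max_iter=200)
    print('episode %d: reward %.2f' % (episode, total_reward))
agent.save_model('/tmp/model.ckpt')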
Example #4
class ActorCritic(object):
    def __init__(self, env, device):
        self.s_dim = env.s_dim
        self.a_dim = env.a_dim
        self.env_epi_length = env.nT

        self.device = device

        self.replay_buffer = ReplayBuffer(env,
                                          device,
                                          buffer_size=self.env_epi_length,
                                          batch_size=self.env_epi_length)
        self.initial_ctrl = InitialControl(env, device)

        self.actor_rbfnet = rbf.RBF(self.s_dim, ACTOR_BASIS_NUMBERS,
                                    BASIS_FCNS)  # (T, S) --> (T, F)
        self.actor_f_dim = ACTOR_BASIS_NUMBERS

        self.critic_rbfnet = rbf.RBF(self.s_dim, CRITIC_BASIS_NUMBERS,
                                     BASIS_FCNS)  # (T, S) --> (T, F)
        self.critic_f_dim = CRITIC_BASIS_NUMBERS

        self.actor_mu = np.zeros([self.actor_f_dim, self.a_dim])  # (F, A)
        self.actor_sigma = np.zeros([self.actor_f_dim, self.a_dim])  # (F, A)
        self.critic_theta = np.zeros([self.critic_f_dim, 1])  # (F, 1)

    def ctrl(self, epi, step, x, u):
        if epi < INITIAL_POLICY_INDEX:
            a_val = self.initial_ctrl.controller(step, x, u)
        else:
            a_val = self.choose_action(epi, step, x)

        a_val = np.clip(a_val, -2, 2)
        return a_val

    def choose_action(self, epi, step, x):

        if step == 0:
            actor_phi = self.actor_rbfnet.eval_basis(x)  # (1, F)
            actor_mean = self.compute_actor_mean(actor_phi)
            actor_var = self.compute_actor_var(actor_phi)

            self.action_traj = np.random.multivariate_normal(
                actor_mean[0], actor_var,
                [self.env_epi_length]).reshape([-1, self.a_dim])  # (T, A)

        action = self.action_traj[step, :].reshape(1, -1)  # (1, A)
        return action

    def add_experience(self, epi, *single_expr):
        x, u, r, x2, term = single_expr
        self.replay_buffer.add(x, u, r, x2, term)

        if term:  # In on-policy method, clear buffer when episode ends
            self.train(epi)
            self.replay_buffer.clear()

    def learning_rate_schedule(self, epi):
        self.alpha_amu = LEARNING_RATE / (1 + epi**0.5)
        self.alpha_asig = LEARNING_RATE / (1 + epi**0.5)
        self.alpha_c = LEARNING_RATE / (1 + epi**0.5)

    def compute_actor_mean(self, actor_phi):
        actor_mean = actor_phi @ self.actor_mu  # (1, F) @ (F, A)
        return actor_mean
        # return np.clip(actor_mean, -1, 1)

    def compute_actor_var(self, actor_phi):
        actor_var = SIGMA * np.diag((np.exp(actor_phi @ self.actor_sigma)**2 +
                                     1E-4)[0])  # (1, F) @ (F, A) --> (A, A)
        return actor_var
        # return np.clip(actor_var, -1, 1)

    def train(self, epi):
        self.learning_rate_schedule(epi)
        s_traj, a_traj, r_traj, s2_traj, term_traj = \
            self.replay_buffer.sample_sequence()  # T-number sequence
        traj_data = list(zip(s_traj, a_traj, r_traj, s2_traj))

        del_actor_mu_sum = 0.
        del_actor_sigma_sum = 0.
        del_critic_weight_sum = 0.
        epi_cost = 0.

        for single_data in reversed(traj_data):
            del_critic_weight, td, mc, epi_cost = self.compute_critic_grad(
                single_data, epi_cost)
            del_actor_mu, del_actor_sigma = self.compute_actor_grad(
                single_data)

            del_actor_mu_sum += del_actor_mu
            del_actor_sigma_sum += del_actor_sigma
            del_critic_weight_sum += del_critic_weight

            del_actor_weight_sum = np.concatenate(
                [del_actor_mu_sum, del_actor_sigma_sum], axis=0)

            # Critic update
            self.critic_theta -= self.alpha_c * del_critic_weight_sum

            # Actor update - Natural policy gradient
            # fisher = del_actor_weight_sum @ del_actor_weight_sum.T
            # try:
            #     fisher_chol = sp.linalg.cholesky(fisher + 1E-4 * np.eye(2 * self.actor_f_dim))
            #     del_actor_weight = sp.linalg.solve_triangular(fisher_chol, sp.linalg.solve_triangular(fisher_chol.T, del_actor_weight_sum, lower=True))  # [2F, A]
            # except np.linalg.LinAlgError:
            #     del_actor_weight = np.linalg.inv(fisher + 1E-2 * np.eye(2 * self.actor_f_dim)) @ del_actor_weight_sum
            #
            #
            # self.actor_mu -= self.alpha_amu * del_actor_weight[:self.actor_f_dim] * td
            # self.actor_sigma -= self.alpha_asig * del_actor_weight[self.actor_f_dim:] * td

            # Actor update - Advantage actor critic, inf hor
            self.actor_mu -= self.alpha_amu * del_actor_mu * td
            self.actor_sigma -= self.alpha_asig * del_actor_sigma * td
            #
        # # Actor update - REINFORCE
        # self.actor_mu -= self.alpha_amu * del_actor_mu_sum * mc
        # self.actor_sigma -= self.alpha_asig * del_actor_sigma_sum * mc

        self.actor_mu = np.clip(self.actor_mu, -10, 10)
        self.actor_sigma = np.clip(self.actor_sigma, -10, 10)
        self.critic_theta = np.clip(self.critic_theta, -10, 10)

        print(np.linalg.norm(self.actor_mu), np.linalg.norm(self.actor_sigma),
              np.linalg.norm(self.critic_theta))

    def compute_critic_grad(self, single_data, epi_cost):
        x, u, r, x2 = [_.reshape([1, -1]) for _ in single_data]

        critic_phi = self.critic_rbfnet.eval_basis(x)  # (1, F)
        critic_phi_next = self.critic_rbfnet.eval_basis(x2)  # (1, F)

        V_curr = np.clip(critic_phi @ self.critic_theta, 0., 5.)
        V_next = np.clip(critic_phi_next @ self.critic_theta, 0., 5.)

        td = r + GAMMA * V_next - V_curr  # (1, 1)

        del_critic_weight = (-critic_phi).T @ td  # (F, 1)

        epi_cost = GAMMA * epi_cost + r
        mc = epi_cost - V_curr
        return del_critic_weight, td, mc, epi_cost

    def compute_actor_grad(self, single_data):
        x, u, r, x2 = [_.reshape([1, -1]) for _ in single_data]

        actor_phi = self.actor_rbfnet.eval_basis(x)  # (1, F)
        eps = u - self.compute_actor_mean(actor_phi)  # (1, F) @ (F, A)
        actor_var_inv = np.linalg.inv(
            self.compute_actor_var(actor_phi))  # (A, A)

        dlogpi_dmu = actor_phi.T @ eps @ actor_var_inv  # (F, 1) @ (1, A) @ (A, A)
        dlogpi_dsigma = SIGMA * np.repeat(actor_phi, self.a_dim, axis=0).T @ (
            eps.T @ eps @ actor_var_inv - np.eye(self.a_dim)
        )  # (F, A) @ (A, A)

        return dlogpi_dmu, dlogpi_dsigma
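
The two returns of compute_actor_grad appear to be the score-function (log-likelihood) gradients of the Gaussian policy used in choose_action. Writing the policy as \(\pi(u \mid x) = \mathcal{N}\big(\mu(x), \Sigma(x)\big)\) with \(\mu(x) = \phi(x) M\), where \(\phi(x)\) is the RBF feature row vector and \(M\) is actor_mu, the gradient with respect to \(M\) is

\[
\nabla_{M} \log \pi(u \mid x) = \phi(x)^{\top}\,\big(u - \mu(x)\big)\,\Sigma(x)^{-1},
\]

which matches dlogpi_dmu = actor_phi.T @ eps @ actor_var_inv; dlogpi_dsigma applies the analogous rule through the exponential variance parameterisation of compute_actor_var. This is a reading of the code under that parameterisation, not a formula taken from the original project.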
Example #5
class DQN(object):
    def __init__(self,
                 state_dim,
                 num_actions,
                 eps_anneal,
                 gamma=0.99,
                 update_freq=100,
                 sess=None):
        self.state_dim = state_dim
        self.num_actions = num_actions
        self.gamma = gamma
        self.eps_anneal = eps_anneal
        self.update_freq = update_freq

        self.batch_size = 64

        self.replay_buffer = ReplayBuffer(3000,
                                          state_dim=state_dim,
                                          action_dim=1)
        self.__build_model()

        if sess is None:
            self.sess = tf.InteractiveSession()
        else:
            self.sess = sess

        self.reset_policy()

        writer = tf.summary.FileWriter('logs', self.sess.graph)
        writer.close()

    def reset_policy(self):
        tf.global_variables_initializer().run()
        self.train_idx = 0
        self.replay_buffer.clear()
        self.eps_anneal.reset()

    def __build_q_func(self, input_var, name, reuse=False):
        with tf.variable_scope(name, reuse=reuse) as scope:
            layer1 = tf.contrib.layers.fully_connected(
                input_var, 32, activation_fn=tf.nn.relu, scope='layer1')
            layer2 = tf.contrib.layers.fully_connected(
                layer1, 16, activation_fn=tf.nn.relu, scope='layer2')
            q_vals = tf.contrib.layers.fully_connected(layer2,
                                                       self.num_actions,
                                                       activation_fn=None,
                                                       scope='q_vals')
        return q_vals

    def __build_model(self):
        # forward model
        self.states = tf.placeholder(tf.float32, [None, self.state_dim],
                                     name='states')
        self.actions = tf.placeholder(tf.int32, [None], name='actions')
        self.action_q_vals = self.__build_q_func(self.states,
                                                 name='action_q_func')
        self.output_actions = tf.argmax(self.action_q_vals,
                                        axis=1,
                                        name='output_actions')
        self.sampled_q_vals = tf.reduce_sum(tf.multiply(
            self.action_q_vals, tf.one_hot(self.actions, self.num_actions)),
                                            1,
                                            name='sampled_q_vals')

        self.target_q_vals = self.__build_q_func(self.states,
                                                 name='target_q_func')
        self.max_q_vals = tf.reduce_max(self.target_q_vals,
                                        axis=1,
                                        name='max_q_vals')

        # loss
        self.rewards = tf.placeholder(tf.float32, [None], name='rewards')
        self.terminal = tf.placeholder(tf.float32, [None], name='terminal')
        self.q_vals_next_state = tf.placeholder(tf.float32, [None],
                                                name='q_vals_next_state')

        self.terminal_mask = tf.subtract(1.0, self.terminal)

        self.disc_return = tf.add(self.rewards,
                                  tf.multiply(
                                      self.terminal_mask,
                                      tf.multiply(self.gamma,
                                                  self.q_vals_next_state)),
                                  name='disc_return')

        self.td_error = tf.subtract(self.disc_return,
                                    self.sampled_q_vals,
                                    name='td_error')
        self.loss = tf.reduce_mean(tf.square(self.td_error), name='loss')
        self.optimizer = tf.train.RMSPropOptimizer(0.00025).minimize(self.loss)

        # updating target network
        var_sort_lambd = lambda x: x.name
        self.action_q_vars = sorted(tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope='action_q_func'),
                                    key=var_sort_lambd)
        self.target_q_vars = sorted(tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope='target_q_func'),
                                    key=var_sort_lambd)

        update_target_ops = []
        for action_q, target_q in zip(self.action_q_vars, self.target_q_vars):
            update_target_ops.append(target_q.assign(action_q))
        self.update_target_ops = tf.group(*update_target_ops,
                                          name='update_target_ops')

    def __update_target_network(self):
        self.sess.run(self.update_target_ops)

    def get_action(self, state):
        sample = np.random.random_sample()
        if sample > self.eps_anneal.eps:
            fd = {self.states: np.array([state])}
            output_action = self.sess.run(self.output_actions, feed_dict=fd)
            action = output_action.item()
        else:
            action = np.random.randint(self.num_actions)

        return action

    def curr_policy(self):
        return partial(DQN.get_action, self)

    def save_model(self, filename='/tmp/model.ckpt'):
        saver = tf.train.Saver()
        save_path = saver.save(self.sess, filename)
        print("Model saved in file: %s" % filename)

    def load_model(self, filename='/tmp/model.ckpt'):
        saver = tf.train.Saver()
        saver.restore(self.sess, filename)
        print("Model loaded from file: %s" % filename)

    def update(self, env, get_state, max_iter=1000):
        state = env.reset()

        action = self.get_action(state)

        total_reward = 0
        for i in range(max_iter):
            [new_state, reward, done, _] = env.step(action)
            total_reward += reward

            self.replay_buffer.insert(state, action, reward, new_state, done)

            state = new_state

            if self.train_idx >= self.batch_size:
                sample = self.replay_buffer.sample(self.batch_size)

                # get max q values of next state
                fd = {self.states: sample['next_state']}
                max_q_vals = self.sess.run(self.max_q_vals, feed_dict=fd)

                fd = {
                    self.states: sample['state'],
                    self.actions: sample['action'].squeeze(),
                    self.rewards: sample['reward'],
                    self.terminal: sample['terminal'],
                    self.q_vals_next_state: max_q_vals
                }

                loss, _ = self.sess.run([self.loss, self.optimizer],
                                        feed_dict=fd)

                if self.train_idx % self.update_freq == 0:
                    self.__update_target_network()
            if done:
                break

            action = self.get_action(state)
            self.train_idx += 1

        self.eps_anneal.update()
        return total_reward
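
The DQN constructor takes an eps_anneal object whose class is not shown; from its use here it needs an .eps attribute read in get_action, an .update() called once per episode at the end of update(), and a .reset() called from reset_policy(). The linear annealer below is a hypothetical stand-in that satisfies that interface, not the schedule used by the original project.

class LinearEpsAnneal(object):
    """Hypothetical epsilon schedule matching the eps_anneal interface above."""

    def __init__(self, eps_start=1.0, eps_end=0.05, anneal_episodes=500):
        self.eps_start = eps_start
        self.eps_end = eps_end
        self.step = (eps_start - eps_end) / float(anneal_episodes)
        self.reset()

    def reset(self):
        self.eps = self.eps_start

    def update(self):
        # decay linearly once per episode until the floor is reached
        self.eps = max(self.eps_end, self.eps - self.step)

With this stand-in, DQN(state_dim, num_actions, LinearEpsAnneal()) would anneal exploration from 1.0 down to 0.05 over roughly 500 episodes.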
Example #6
def train(sess, args, actor, critic):
    plt.ion()  # enable interactive mode
    speedmode = 6
    madr = 1.4
    gapvector = [0] * 16
    totalreward = []

    le = 10000
    options = get_options()
    if options.nogui:
        sumoBinary = checkBinary('sumo')
    else:
        sumoBinary = checkBinary('sumo-gui')
    leading = []

    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(
        args['summary_dir'] + " actor_lr" + str(args['actor_lr']) +
        " critic_lr" + str(args["critic_lr"]), sess.graph)

    actor.update_target_network()
    critic.update_target_network()

    replay_buffer = ReplayBuffer(int(args['buffer_size']),
                                 int(args['random_seed']))

    for i in range(1200):
        #        print(i)
        zongreward = 0
        locationplot = []
        speedplot = []
        timeplot = []
        traci.start([sumoBinary, "-c", "hello.sumocfg"])
        #        print('shenme')
        locationplot = []
        speedplot = []

        timeplot = []
        done = 0
        chusudu = 14
        # use a separate loop variable so the episode index i is not shadowed
        for k in range(0, 40):
            leading.append(0)
        for k in range(40, 70):
            leading.append(-1)
        for k in range(70, 200):
            leading.append(1)

        for step in range(100):
            exist_list = traci.vehicle.getIDList()
            if len(exist_list) > 0:
                traci.vehicle.setSpeed(exist_list[0], chusudu)
            traci.simulationStep()
        gapvector = [2 * chusudu] * 16
        #        print(gapvector)
        vehicle_ids = ['a'] + ['b.%d' % k for k in range(8)] + \
                      ['c.%d' % k for k in range(8)]
        for idx, vid in enumerate(vehicle_ids):
            # place each follower sum(gapvector[:idx]) metres behind the leader
            traci.vehicle.moveTo(vid, 'L4_0', le - sum(gapvector[:idx]))
        traci.simulationStep()
        chushiweizhi = []
        exist_list = traci.vehicle.getIDList()
        for xx in exist_list:
            chushiweizhi.append(traci.vehicle.getPosition(xx)[0])

        touche = leading

        ep_ave_max_q = 0

        for j in range(int(args['max_episode_len'])):
            #            pjz=0

            initialsp = []
            state2 = []
            state = []
            reward = []
            #            print()
            xiayimiaosudu = np.clip(
                traci.vehicle.getSpeed(exist_list[0]) + touche[j], 0, chusudu)
            traci.vehicle.setSpeed(exist_list[0], xiayimiaosudu)
            for xx in exist_list:
                traci.vehicle.setSpeedMode(xx, speedmode)
                initialsp.append(traci.vehicle.getSpeed(xx))
                locationplot.append(traci.vehicle.getPosition(xx)[0] / 1000)
                speedplot.append(traci.vehicle.getSpeed(xx))
                timeplot.append(j)

            for mm in range(1, NUM_AGENTS + 1):
                #                touchea=exist_list[0]
                ziji = exist_list[mm]
                qianche = exist_list[mm - 1]
                gap = traci.vehicle.getLeader(ziji)[1]
                zhuangtai1 = (traci.vehicle.getSpeed(qianche) -
                              traci.vehicle.getSpeed(ziji)) / 10
                zhuangtai2 = (traci.vehicle.getSpeed(ziji) - 16) / 16
                zhuangtai3 = (math.sqrt(max(gap, 0)) - 20) / 20
                state.append([zhuangtai1, zhuangtai2, zhuangtai3])

            action = actor.predict([state])[0]
            chaoguo = [0] * NUM_AGENTS
            for mm in range(1, NUM_AGENTS + 1):
                ziji = exist_list[mm]
                qianche = exist_list[mm - 1]
                zijisudu = traci.vehicle.getSpeed(ziji)
                qianchesudu = traci.vehicle.getSpeed(qianche)
                gapa = traci.vehicle.getLeader(ziji)[1]
                if qianchesudu - 3 < zijisudu:
                    gap = gapa - 5 - zijisudu + max(qianchesudu - 3, 0)
                    if gap < 0:
                        amax = -3
#                        print(gap)
                    else:
                        #                        amax=math.sqrt(madr*gap)+sp[i]-sp[i+1]-3
                        amax = min(gap / 3, math.sqrt(
                            madr * gap)) + qianchesudu - zijisudu - 3
                        amax = np.clip(amax, -3, 3)
                else:
                    amax = 3
#                ac=np.clip(action[mm-1][0]/10,-3,3)
#                if pjz==0:
#                    ave=sum(action)/NUM_AGENTS
#                    pjz=1
                ac = np.clip(action[mm - 1][0] / 10, -3, 3)
                #                print(j,ave,action,ac)
                if ac > amax:
                    chaoguo[mm - 1] = 1
#                print(action[mm-1][0])
#                print(j,mm,ac,amax)
                nextspeed = traci.vehicle.getSpeed(exist_list[mm]) + min(
                    amax, ac)
                #                nextspeed=traci.vehicle.getSpeed(exist_list[mm])+ac
                #                print(action[mm-1][0])
                traci.vehicle.setSpeed(exist_list[mm], nextspeed)
            traci.simulationStep()
            #            for i in NUM_AGENTS+1):
            #                if i>0 and (po[i]>po[i-1]-5 or po[i]<-10000):
            #                    chongtu[i-1]=1
            chongtu = [0] * NUM_AGENTS
            #            print(j)
            for mm in range(1, NUM_AGENTS + 1):
                ziji = exist_list[mm]
                qianche = exist_list[mm - 1]
                #                print(traci.vehicle.getPosition(ziji)[0])
                if traci.vehicle.getPosition(ziji)[0] < -10000:
                    chongtu[mm - 1] = 1
                re = min((traci.vehicle.getAcceleration(ziji))**2 / 9, 1)
                #                print(mm-1,traci.vehicle.getAcceleration(ziji),re)
                if chongtu[mm - 1] == 0:
                    gap = traci.vehicle.getLeader(ziji)[1]
                else:
                    gap = 0
                if gap > 100:
                    re += gap / 100
#                print(mm-1,gap,re)
                if chaoguo[mm - 1] == 1:
                    re += 1
                if chongtu[mm - 1] == 1:
                    re += 5


#                    print('chaoguo')
#                print(mm-1,chaoguo[mm-1],re)

                reward.append([1 - re])
                done = True
            state2 = None

            replay_buffer.add(state, action, reward, done, state2)
            #            print(reward)

            if replay_buffer.size() > int(
                    args['minibatch_size']) or sum(chongtu) > 0:

                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(int(args['minibatch_size']))
                #                print(j)
                #                print(chongtu)
                if j % 33 == 32:
                    predicted_q_value, _, loss = critic.train(
                        s_batch, a_batch,
                        np.reshape(r_batch, (32, NUM_AGENTS, 1)))
                else:
                    predicted_q_value, _, loss = critic.train(
                        s_batch, a_batch,
                        np.reshape(r_batch, (j % 33 + 1, NUM_AGENTS, 1)))

                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads)

                actor.update_target_network()
                critic.update_target_network()
                #                print('xunlianle')

                replay_buffer.clear()

                # Log
                summary_str = sess.run(summary_ops,
                                       feed_dict={
                                           summary_vars[0]:
                                           np.mean(r_batch),
                                           summary_vars[1]:
                                           ep_ave_max_q / float(j + 1),
                                           summary_vars[2]:
                                           loss
                                       })

                writer.add_summary(summary_str, i)
                writer.flush()
                #                print(j,reward,r_batch,np.mean(r_batch))

                state = []
                reward = []

                #                print('| Reward: {:.4f} | Episode: {:d} | Qmax: {:.4f}'.format(np.mean(r_batch),
                #                                                                               i, (ep_ave_max_q / float(j + 1))))
                zongreward += np.mean(r_batch)
                print(j, action, chaoguo)
            if sum(chongtu) > 0:
                print(traci.vehicle.getIDCount())
                print('zhuangle22222222222222222222222222')
                replay_buffer.clear()
                traci.close()
                sys.stdout.flush()
                #                bre=1
                break

        replay_buffer.clear()
        traci.close()
        sys.stdout.flush()
        #        print(ave)
        #            if state2!=None:
        #                print(state,action,reward,state2)
        #        print(totalreward,zongreward)
        print(j, zongreward / 9 - 1)
        if j > 180:
            totalreward.append(zongreward / 9 - 1)
        plt.ion()
        plt.figure(i * 2 - 1)
        plt.plot(np.arange(len(totalreward)), totalreward)
        plt.xlabel('Episode')
        plt.ylabel('Episode reward')
        plt.draw()
        plt.pause(1)
        plt.close()  # higher is better

        plt.ion()
        plt.figure(i * 2)
        plt.scatter(timeplot, locationplot, c=speedplot, s=10, alpha=0.3)
        plt.colorbar()
        plt.xlabel('Time (s)')
        plt.ylabel('Location (km)')
        plt.grid(True)
        plt.show()

    M8 = np.mat(totalreward)
    np.savetxt("M8.csv", M8, delimiter=',')
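
Examples #2 and #6 both call a build_summaries() helper that is not reproduced above. From the three summary_vars fed in the sess.run calls (mean batch reward, running Qmax, critic loss), it presumably looks something like the sketch below, written against the TF1 summary API these scripts already use; the variable and summary names are guesses, not taken from the original projects.

import tensorflow as tf


def build_summaries():
    # Hypothetical reconstruction of the helper used in Examples #2 and #6:
    # three scalar summaries matching summary_vars[0], [1] and [2].
    episode_reward = tf.Variable(0., name='episode_reward')
    tf.summary.scalar('Reward', episode_reward)
    episode_ave_max_q = tf.Variable(0., name='episode_ave_max_q')
    tf.summary.scalar('Qmax', episode_ave_max_q)
    critic_loss = tf.Variable(0., name='critic_loss')
    tf.summary.scalar('Critic loss', critic_loss)

    summary_vars = [episode_reward, episode_ave_max_q, critic_loss]
    summary_ops = tf.summary.merge_all()
    return summary_ops, summary_vars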