コード例 #1
0
ファイル: apl_ddpg.py プロジェクト: mdheller/costar_plan
class APLDDPGAgent(AbstractAgent):

    name = "apl_ddpg"

    def __init__(self, env, iter=200000, *args, **kwargs):
        # create the actor model
        # create the critic model
        self.env = env
        self.action_dim = sum(
            sum(1 for i in row if i) for row in self.env.action_space.sample())
        self.observation = env.reset()
        self.state_dim = self.observation.shape
        print ">>>>>>>>>>>>>>>>>>>>>state dim " + str(self.state_dim)
        self.nn_action_dim = 6  # limit ddpg network output to 3 DOF
        self.noise = OUProcess(self.nn_action_dim,
                               mu=OU_MEAN,
                               theta=OU_THETA,
                               sigma=EPSILON_RANGE[0])

    def fit(self, *args, **kwargs):

        MEM_SZ = MEM_SIZE_FCL

        sess = K.get_session()
        K.set_learning_phase(1)

        self.actor = ActorNetwork(sess,
                                  self.state_dim,
                                  self.nn_action_dim,
                                  BATCH_SIZE,
                                  TAU,
                                  LRA,
                                  convolutional=CONVOLUTIONAL,
                                  output_activation=ACTION_ACTIVATION)
        self.critic = CriticNetwork(sess,
                                    self.state_dim,
                                    self.nn_action_dim,
                                    BATCH_SIZE,
                                    TAU,
                                    LRC,
                                    convolutional=CONVOLUTIONAL)

        self.memory = Memory(MEM_SZ)

        self.actor.target_model.summary()
        self.critic.target_model.summary()

        if LOAD_WEIGHTS:
            self.actor.model.load_weights(LOAD_WEIGHTS_PREFIX +
                                          "actor_model_" +
                                          LOAD_WEIGHTS_EPISODE + ".h5")
            self.critic.model.load_weights(LOAD_WEIGHTS_PREFIX +
                                           "critic_model_" +
                                           LOAD_WEIGHTS_EPISODE + ".h5")
            self.actor.target_model.load_weights(LOAD_WEIGHTS_PREFIX +
                                                 "actor_target_model_" +
                                                 LOAD_WEIGHTS_EPISODE + ".h5")
            self.critic.target_model.load_weights(LOAD_WEIGHTS_PREFIX +
                                                  "critic_target_model_" +
                                                  LOAD_WEIGHTS_EPISODE + ".h5")
            print("Weights Loaded!")

        #====================================================
        #Initialize noise processes
        #self.noise_procs = []
        #for i in range(NUM_NOISE_PROCS):
        #    self.noise_procs.append(OUProcess(OU_MEAN, OU_THETA, OU_STD_DEV))

        #====================================================

        PRE_LEARNING_EPISODES = STARTING_EPISODE + PRE_LEARNING_EPS
        steps = STARTING_EPISODE * EPISODE_LENGTH
        start_time = time.time()
        last_ep_time = time.time()
        if MAKE_PLOT:
            reward_graph = Grapher()

        for ep in range(STARTING_EPISODE, EPISODES):

            #reset noise processes
            #for ou in self.noise_procs:
            #    ou.reset()

            self.noise.reset()

            #start time counter
            if (ep == PRE_LEARNING_EPISODES):
                start_time = time.time()

            print("Episode: " + str(ep) + "  Frames: " +
                  str(ep * EPISODE_LENGTH) + "  Uptime: " + str(
                      (time.time() - start_time) / 3600.0) +
                  " hrs    ===========")

            state = self.env.reset()

            play_only = (ep % 10 == 0)

            total_reward = 0

            if play_only or ALREADY_TRAINED:
                for step in range(TEST_EPISODE_LENGTH):

                    #print ">>>>>>>>>>>>>", state.shape
                    #img = np.array([np.subtract(img, 128)], dtype=np.float32) #zero center
                    #img = np.multiply(img, 1.0/128.0) #scale [-1,1]
                    #img = np.transpose(state, (1,2,0))

                    #img = np.array(state)
                    #img = np.transpose(img, (1,2,0))

                    #print ">>>>>>>>>>>>>", state.shape

                    state = np.reshape(state, state.shape + (1, ))

                    action, control_action = self.selectAction(
                        state, can_be_random=False, use_target=True)

                    nstate, reward, done, info = self.env.step(control_action)
                    total_reward += reward
                    state = nstate
            else:
                for step in range(EPISODE_LENGTH):

                    # ACT ==============================
                    epsilon = (float(steps) / float(EPSILON_STEPS)) * (
                        EPSILON_RANGE[1] - EPSILON_RANGE[0]) + EPSILON_RANGE[0]

                    state = np.reshape(state, state.shape + (1, ))

                    action, control_action = self.selectAction(state,
                                                               epsilon=epsilon)
                    new_state, reward, done, info = self.env.step(
                        control_action)
                    done = done or (step >= EPISODE_LENGTH)
                    self.memory.addMemory(state, action, reward, new_state,
                                          done)
                    state = new_state

                    # LEARN ============================
                    if ep > PRE_LEARNING_EPISODES:
                        batch, idxs = self.memory.getMiniBatch(BATCH_SIZE)
                        self.learnFromBatch(batch)

                    if done:
                        break
                    # CLEANUP ==========================
                    steps += 1

            #we need to consider the episodes without noise to actually tell how the system is doing
            if play_only and MAKE_PLOT:
                reward_graph.addSample(total_reward)
                reward_graph.displayPlot()

            #calculate fph on total frames
            total_frames = (ep - PRE_LEARNING_EPISODES) * EPISODE_LENGTH
            elapsed = time.time() - start_time
            fps = total_frames / elapsed
            fph = fps * 3600.0

            #re-calculate fps on this episode, so it updates quickly
            fps = EPISODE_LENGTH / (time.time() - last_ep_time)
            last_ep_time = time.time()
            print("fps: " + str(fps) + "  fph: " + str(fph) + "\n")

            #save plot and weights
            if (ep > 0 and ep % EPISODE_SAVE_FREQUENCY
                    == 0) and not ALREADY_TRAINED:

                #plot
                if MAKE_PLOT:
                    reward_graph.savePlot(SAVE_WEIGHTS_PREFIX + "graph_" +
                                          str(ep) + ".jpg")

                #weights
                self.actor.model.save_weights(SAVE_WEIGHTS_PREFIX +
                                              "actor_model_" + str(ep) + ".h5",
                                              overwrite=True)
                self.actor.target_model.save_weights(
                    SAVE_WEIGHTS_PREFIX + "actor_target_model_" + str(ep) +
                    ".h5",
                    overwrite=True)
                self.critic.model.save_weights(
                    SAVE_WEIGHTS_PREFIX + "critic_model_" + str(ep) + ".h5",
                    overwrite=True)
                self.critic.target_model.save_weights(
                    SAVE_WEIGHTS_PREFIX + "critic_target_model_" + str(ep) +
                    ".h5",
                    overwrite=True)

                #network structures (although I don't think I ever actually use these)
                with open(
                        SAVE_WEIGHTS_PREFIX + "actor_model_" + str(ep) +
                        ".json", "w") as outfile:
                    json.dump(self.actor.model.to_json(), outfile)
                with open(
                        SAVE_WEIGHTS_PREFIX + "actor_target_model_" + str(ep) +
                        ".json", "w") as outfile:
                    json.dump(self.actor.target_model.to_json(), outfile)
                with open(
                        SAVE_WEIGHTS_PREFIX + "critic_model_" + str(ep) +
                        ".json", "w") as outfile:
                    json.dump(self.critic.model.to_json(), outfile)
                with open(
                        SAVE_WEIGHTS_PREFIX + "critic_target_model_" +
                        str(ep) + ".json", "w") as outfile:
                    json.dump(self.critic.target_model.to_json(), outfile)

    def learnFromBatch(self, miniBatch):

        dones = np.asarray([sample['isFinal'] for sample in miniBatch])
        states = np.asarray([sample['state'] for sample in miniBatch])
        actions = np.asarray([sample['action'] for sample in miniBatch])
        new_states = np.asarray([sample['newState'] for sample in miniBatch])
        Y_batch = np.asarray([sample['reward'] for sample in miniBatch])

        new_states = np.reshape(new_states, new_states.shape + (1, ))

        target_q_values = self.critic.target_model.predict(
            [new_states,
             self.actor.target_model.predict(new_states)])

        for i in range(len(miniBatch)):
            if not dones[i]:
                Y_batch[i] = Y_batch[i] + GAMMA * target_q_values[i]

        self.critic.model.train_on_batch([states, actions], Y_batch)

        #additional operations to train actor
        temp_actions = self.actor.model.predict(states)
        grads = self.critic.gradients(states, temp_actions)
        self.actor.train(states, grads)

        #update target networks
        self.actor.target_train()
        self.critic.target_train()

    ''' This is wrong I think
    def OU(x, mu, theta, sigma):
        return theta * (mu - x) + sigma * np.random.randn(1)
    '''

    def clip(self, x, minx, maxx):
        return max(minx, min(maxx, x))

    def selectAction(self,
                     state,
                     can_be_random=True,
                     use_target=False,
                     epsilon=1.0,
                     permutation_num=0):
        state = np.array([state])  #add dimension to make a "batch" of 1

        if use_target:
            actions = self.actor.target_model.predict(state)
        else:
            actions = self.actor.model.predict(state)

        actions = np.squeeze(actions)

        #print control_actions

        #print("+++++++++++")
        #print(actions)

        if can_be_random:
            self.noise.sigma = epsilon
            noise = self.noise.noise()
            #print noise

            i = 0
            for idx, a in enumerate(actions):
                actions[i] = actions[i] + noise[i]
                actions[i] = self.clip(
                    actions[i], -3.14,
                    3.14)  #need to assign to actions[i], not just a.
                i += 1

            #get noise
            #noise = []
            #iterate over all noise procs for non-coop, or a single agent's procs for co-op
            #for n in range(permutation_num*ACTIONS_PER_AGENT, permutation_num*ACTIONS_PER_AGENT + self.action_dim):
            #    ou = self.noise_procs[n]
            #    noise.append(ou.step())

#            for idx, a in enumerate(actions):
#                ou = self.noise_procs[0]
#                noise = ou.step()
#                a = a + epsilon*noise
#                #print epsilon * noise
#                actions[i] = self.clip(a, -3.14, 3.14) #need to assign to actions[i], not just a.
#                i += 1
#
#print(actions)

#fill in zeros for all non-learned outputs
        control_actions = np.pad(actions, (0, self.action_dim - len(actions)),
                                 'constant')
        #print actions
        #print control_actions

        return actions, control_actions

    #Constructs an image from state vector
    def constructImageRepresentation(self, state):
        img = np.empty([IMAGE_SIDE_LENGTH, IMAGE_SIDE_LENGTH], dtype=np.uint8)
        img.fill(128)

        color = 255
        delta_color = int(math.floor(128 / NUM_TARGETS))
        for j in range(NUM_TARGETS):
            tar = [state[2 * j], state[2 * j + 1]]
            cv2.circle(img, (int(
                tar[0] * IMAGE_SIDE_LENGTH), int(tar[1] * IMAGE_SIDE_LENGTH)),
                       5, 0, -1)
            cv2.circle(img, (int(
                tar[0] * IMAGE_SIDE_LENGTH), int(tar[1] * IMAGE_SIDE_LENGTH)),
                       4, color, -1)
            color -= delta_color

        color = 0
        for j in range(NUM_AGENTS):
            offset = 2 * NUM_TARGETS
            agent = [state[offset + 2 * j], state[offset + 2 * j + 1]]
            #draw blank agent, no thrust display
            cv2.rectangle(img, (int(agent[0] * IMAGE_SIDE_LENGTH) - 4,
                                int(agent[1] * IMAGE_SIDE_LENGTH) - 1),
                          (int(agent[0] * IMAGE_SIDE_LENGTH) + 4,
                           int(agent[1] * IMAGE_SIDE_LENGTH) + 1), color, -1)
            cv2.rectangle(img, (int(agent[0] * IMAGE_SIDE_LENGTH) - 1,
                                int(agent[1] * IMAGE_SIDE_LENGTH) - 4),
                          (int(agent[0] * IMAGE_SIDE_LENGTH) + 1,
                           int(agent[1] * IMAGE_SIDE_LENGTH) + 4), color, -1)
            #first agent ia 0 since we control it, others are same color
            color = 64
        '''
        cv2.namedWindow('perm_image',cv2.WINDOW_NORMAL)
        cv2.resizeWindow('perm_image', 600,600)
        cv2.imshow('perm_image', img)
        cv2.waitKey(1)
        '''

        img = np.array([np.subtract(img, 128)], dtype=np.float32)  #zero center
        img = np.multiply(img, 1.0 / 128.0)  #scale [-1,1]
        img = np.transpose(img, (1, 2, 0))

        return img

    #for co-op case, get an arrangement of the state vector for each agent.
    def getStatePermutations(self, state):
        perms = []
        for i in range(NUM_AGENTS):

            if CONVOLUTIONAL and not DRAW_STATE:
                perms.append(state)
            else:
                pstate = []

                #copy over target data
                for j in range(NUM_TARGETS * 2):
                    pstate.append(state[j])

                #copy agent data, rotated
                for j in range(NUM_AGENTS * 2):
                    rot_j = (j +
                             (i * 2)) % (NUM_AGENTS * 2) + (NUM_TARGETS * 2)
                    pstate.append(state[rot_j])

                if DRAW_STATE:
                    perms.append(constructImageRepresentation(pstate))
                else:
                    perms.append(np.asarray(pstate, dtype=np.float32))

        return perms
コード例 #2
0
ファイル: ddpg.py プロジェクト: ZhichenML/IPPS
def run_ddpg(amodel,
             cmodel,
             train_indicator=0,
             seeded=1337,
             track_name='practgt2.xml'):
    OU = FunctionOU()
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001  # Target Network HyperParameters
    LRA = 0.0001  # Learning rate for Actor
    LRC = 0.001  # Lerning rate for Critic
    ALPHA = 0.9

    action_dim = 3  # Steering/Acceleration/Brake
    state_dim = 29  # of sensors input

    np.random.seed(seeded)

    vision = False

    EXPLORE = 100000.
    if train_indicator:
        episode_count = 600
    else:
        episode_count = 3
    max_steps = 20000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision,
                   throttle=True,
                   gear_change=False,
                   track_name=track_name)

    if not train_indicator:
        # Now load the weight
        #logging.info("Now we load the weight")
        print("Now we load the weight")
        try:
            actor.model.load_weights(amodel)
            critic.model.load_weights(cmodel)
            actor.target_model.load_weights(amodel)
            critic.target_model.load_weights(cmodel)
            #logging.info(" Weight load successfully")
            print("Weight load successfully")
        except:
            #ogging.info("Cannot find the weight")
            print("Cannot find the weight")
            exit()

    #logging.info("TORCS Experiment Start.")
    print("TORCS Experiment Start.")
    best_lap = 500

    for i_episode in range(episode_count):
        print("Episode : " + str(i_episode) + " Replay Buffer " +
              str(buff.count()))
        #logging.info("Episode : " + str(i_episode) + " Replay Buffer " + str(buff.count()))
        if np.mod(i_episode, 3) == 0:
            ob = env.reset(
                relaunch=True
            )  # relaunch TORCS every 3 episode because of the memory leak error
        else:
            ob = env.reset()

        s_t = np.hstack((ob.speedX, ob.angle, ob.trackPos, ob.speedY,
                         ob.speedZ, ob.rpm, ob.wheelSpinVel / 100.0, ob.track))

        total_reward = 0.

        for j_iter in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            s_t1 = np.hstack(
                (ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ,
                 ob.rpm, ob.wheelSpinVel / 100.0, ob.track))

            buff.add(s_t, a_t[0], r_t, s_t1, done)  # Add replay buffer

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if train_indicator:
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            print("Episode", i_episode, "Step", step, "Action", a_t, "Reward",
                  r_t, "Loss", loss)

            if np.mod(step, 1000) == 0:
                logging.info("Episode {}, Distance {}, Last Lap {}".format(
                    i_episode, ob.distRaced, ob.lastLapTime))
                if ob.lastLapTime > 0:
                    if best_lap < ob.lastLapTime:
                        best_lap = ob.lastLapTime

            step += 1
            if done:
                break

        if train_indicator and i_episode > 20:
            if np.mod(i_episode, 3) == 0:
                logging.info("Now we save model")
                actor.model.save_weights("ddpg_actor_weights_periodic.h5",
                                         overwrite=True)
                critic.model.save_weights("ddpg_critic_weights_periodic.h5",
                                          overwrite=True)

        print("TOTAL REWARD @ " + str(i_episode) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("Best Lap {}".format(best_lap))
        print("")
        logging.info("TOTAL REWARD @ " + str(i_episode) +
                     "-th Episode  : Reward " + str(total_reward))
        logging.info("Best Lap {}".format(best_lap))
    env.end()  # This is for shutting down TORCS
    logging.info("Finish.")
コード例 #3
0
class NeuralAgent():
    def __init__(self, track_name='practgt2.xml'):
        BUFFER_SIZE = 100000
        TAU = 0.001  # Target Network HyperParameters
        LRA = 0.0001  # Learning rate for Actor
        LRC = 0.001  # Lerning rate for Critic
        state_dim = 29  # of sensors input
        self.batch_size = 32
        self.lambda_mix = 10.0
        self.action_dim = 3  # Steering/Acceleration/Brake

        # Tensorflow GPU optimization
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        from keras import backend as K
        K.set_session(sess)

        self.actor = ActorNetwork(sess, state_dim, self.action_dim,
                                  self.batch_size, TAU, LRA)
        self.critic = CriticNetwork(sess, state_dim, self.action_dim,
                                    self.batch_size, TAU, LRC)
        self.buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer
        self.track_name = track_name

        self.save = dict(total_reward=[],
                         total_step=[],
                         ave_reward=[],
                         distRaced=[],
                         distFromStart=[],
                         lastLapTime=[],
                         curLapTime=[],
                         lapTimes=[],
                         avelapTime=[],
                         ave_sp=[],
                         max_sp=[],
                         min_sp=[],
                         test_total_reward=[],
                         test_total_step=[],
                         test_ave_reward=[],
                         test_distRaced=[],
                         test_distFromStart=[],
                         test_lastLapTime=[],
                         test_curLapTime=[],
                         test_lapTimes=[],
                         test_avelapTime=[],
                         test_ave_sp=[],
                         test_max_sp=[],
                         test_min_sp=[])

    def rollout(self, env):
        max_steps = 10000

        vision = False

        # zhichen: it is not stable to have two torcs env and UDP connections
        # env = TorcsEnv(vision=vision, throttle=True, gear_change=False, track_name=self.track_name)

        ob = env.reset(relaunch=True)
        s_t = np.hstack((ob.speedX, ob.angle, ob.trackPos, ob.speedY,
                         ob.speedZ, ob.rpm, ob.wheelSpinVel / 100.0, ob.track))

        total_reward = 0.

        sp = []

        lastLapTime = []

        for j_iter in range(max_steps):

            a_t = self.actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            a_t = a_t[0]
            # print('test a_t:', a_t)
            a_t[0] = clip(a_t[0], -1, 1)
            a_t[1] = clip(a_t[1], 0, 1)
            a_t[2] = clip(a_t[2], 0, 1)

            ob, r_t, done, info = env.step(a_t)

            sp.append(info['speed'])

            if lastLapTime == []:
                if info['lastLapTime'] > 0:
                    lastLapTime.append(info['lastLapTime'])
            elif info['lastLapTime'] > 0 and lastLapTime[-1] != info[
                    'lastLapTime']:
                lastLapTime.append(info['lastLapTime'])

            if np.mod(j_iter + 1, 20) == 0:
                logging.info('step: ' + str(j_iter + 1))
                print('\n ob: ', ob)

            s_t = np.hstack(
                (ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ,
                 ob.rpm, ob.wheelSpinVel / 100.0, ob.track))

            total_reward += r_t

            if done: break

        logging.info("Test Episode Reward: " + str(total_reward) +
                     " Episode Length: " + str(j_iter + 1) + " Ave Reward: " +
                     str(total_reward / (j_iter + 1)) + "\n Distance: " +
                     str(info['distRaced']) + ' ' +
                     str(info['distFromStart']) + "\n Last Lap Times: " +
                     str(info['lastLapTime']) + " Cur Lap Times: " +
                     str(info['curLapTime']) + " lastLaptime: " +
                     str(lastLapTime) + "\n ave sp: " + str(np.mean(sp)) +
                     " max sp: " + str(np.max(sp)))
        #logging.info(" Total Steps: " + str(step) + " " + str(i_episode) + "-th Episode Reward: " + str(total_reward) +
        #            " Episode Length: " + str(j_iter+1) + "  Distance" + str(ob.distRaced) + " Lap Times: " + str(ob.lastLapTime))

        #env.end()  # This is for shutting down TORCS

        ave_sp = np.mean(sp)
        max_sp = np.max(sp)
        min_sp = np.min(sp)

        return total_reward, j_iter + 1, info, ave_sp, max_sp, min_sp, lastLapTime

    def update_neural(self,
                      controllers,
                      episode_count=200,
                      tree=False,
                      seed=1337):
        OU = FunctionOU()
        vision = False
        GAMMA = 0.99
        EXPLORE = 100000.
        max_steps = 10000
        reward = 0
        done = False
        step = 0
        epsilon = 1

        if not tree:
            steer_prog, accel_prog, brake_prog = controllers

        # Generate a Torcs environment
        env = TorcsEnv(vision=vision,
                       throttle=True,
                       gear_change=False,
                       track_name=self.track_name)

        window = 5
        lambda_store = np.zeros((max_steps, 1))
        lambda_max = 40.
        factor = 0.8

        logging.info("TORCS Experiment Start with Lambda = " +
                     str(self.lambda_mix))

        for i_episode in range(episode_count):
            logging.info("Episode : " + str(i_episode) + " Replay Buffer " +
                         str(self.buff.count()))
            if np.mod(i_episode, 3) == 0:
                logging.info('relaunch TORCS')
                ob = env.reset(
                    relaunch=True
                )  # relaunch TORCS every 3 episode because of the memory leak error
            else:
                logging.info('reset TORCS')
                ob = env.reset()

            #[ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ, ob.rpm, list(ob.wheelSpinVel / 100.0), list(ob.track)]
            s_t = np.hstack(
                (ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ,
                 ob.rpm, ob.wheelSpinVel / 100.0, ob.track))

            total_reward = 0.
            tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY],
                       [ob.speedZ], [ob.rpm],
                       list(ob.wheelSpinVel / 100.0),
                       list(ob.track), [0, 0, 0]]
            window_list = [tempObs[:] for _ in range(window)]

            sp = []

            lastLapTime = []

            for j_iter in range(max_steps):
                if tree:
                    tree_obs = [
                        sensor for obs in tempObs[:-1] for sensor in obs
                    ]
                    act_tree = controllers.predict([tree_obs])
                    steer_action = clip_to_range(act_tree[0][0], -1, 1)
                    accel_action = clip_to_range(act_tree[0][1], 0, 1)
                    brake_action = clip_to_range(act_tree[0][2], 0, 1)
                else:
                    steer_action = clip_to_range(
                        steer_prog.pid_execute(window_list), -1, 1)
                    accel_action = clip_to_range(
                        accel_prog.pid_execute(window_list), 0, 1)
                    brake_action = clip_to_range(
                        brake_prog.pid_execute(window_list), 0, 1)
                action_prior = [steer_action, accel_action, brake_action]

                tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY],
                           [ob.speedZ], [ob.rpm],
                           list(ob.wheelSpinVel / 100.0),
                           list(ob.track), action_prior]
                window_list.pop(0)
                window_list.append(tempObs[:])

                loss = 0
                epsilon -= 1.0 / EXPLORE
                a_t = np.zeros([1, self.action_dim])
                noise_t = np.zeros([1, self.action_dim])

                a_t_original = self.actor.model.predict(
                    s_t.reshape(1, s_t.shape[0]))
                noise_t[0][0] = max(epsilon, 0) * OU.function(
                    a_t_original[0][0], 0.0, 0.60, 0.30)
                noise_t[0][1] = max(epsilon, 0) * OU.function(
                    a_t_original[0][1], 0.5, 1.00, 0.10)
                noise_t[0][2] = max(epsilon, 0) * OU.function(
                    a_t_original[0][2], 0, 1.00, 0.05)

                a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
                a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
                a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

                mixed_act = [
                    a_t[0][k_iter] / (1 + self.lambda_mix) +
                    (self.lambda_mix /
                     (1 + self.lambda_mix)) * action_prior[k_iter]
                    for k_iter in range(3)
                ]

                ob, r_t, done, info = env.step(mixed_act)

                sp.append(info['speed'])

                if lastLapTime == []:
                    if info['lastLapTime'] > 0:
                        lastLapTime.append(info['lastLapTime'])
                elif info['lastLapTime'] > 0 and lastLapTime[-1] != info[
                        'lastLapTime']:
                    lastLapTime.append(info['lastLapTime'])

                s_t1 = np.hstack(
                    (ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ,
                     ob.rpm, ob.wheelSpinVel / 100.0, ob.track))

                self.buff.add(s_t, a_t[0], r_t, s_t1,
                              done)  # Add replay buffer

                # Do the batch update
                batch = self.buff.getBatch(self.batch_size)
                states = np.asarray([e[0] for e in batch])
                actions = np.asarray([e[1] for e in batch])
                rewards = np.asarray([e[2] for e in batch])
                new_states = np.asarray([e[3] for e in batch])
                dones = np.asarray([e[4] for e in batch])
                y_t = np.zeros((states.shape[0], 1))

                target_q_values = self.critic.target_model.predict(
                    [new_states,
                     self.actor.target_model.predict(new_states)])

                for k in range(len(batch)):
                    if dones[k]:
                        y_t[k] = rewards[k]
                    else:
                        y_t[k] = rewards[k] + GAMMA * target_q_values[k]

                loss += self.critic.model.train_on_batch([states, actions],
                                                         y_t)
                a_for_grad = self.actor.model.predict(states)
                grads = self.critic.gradients(states, a_for_grad)
                self.actor.train(states, grads)
                self.actor.target_train()
                self.critic.target_train()

                total_reward += r_t
                s_t = s_t1

                # Control prior mixing term
                if j_iter > 0 and i_episode > 50:
                    lambda_track = lambda_max * (1 - np.exp(-factor * np.abs(
                        r_t +
                        GAMMA * np.mean(target_q_values[-1] - base_q[-1]))))
                    lambda_track = np.squeeze(lambda_track)
                else:
                    lambda_track = 10.
                lambda_store[j_iter] = lambda_track
                base_q = copy.deepcopy(target_q_values)

                if np.mod(step, 2000) == 0:
                    logging.info("Episode " + str(i_episode) + " Distance " +
                                 str(ob.distRaced) + " Lap Times " +
                                 str(ob.lastLapTime))

                step += 1
                if done:
                    break

            #else:
            #    env.end()

            self.lambda_mix = np.mean(lambda_store)

            logging.info('Episode ends! \n' + "Total Steps: " + str(step) +
                         " " + str(i_episode) + "-th Episode Reward: " +
                         str(total_reward) + " Episode Length: " +
                         str(j_iter + 1) + " Ave Reward: " +
                         str(total_reward / (j_iter + 1)) + "\n Distance: " +
                         str(info['distRaced']) + ' ' +
                         str(info['distFromStart']) + "\n Last Lap Times: " +
                         str(info['lastLapTime']) + " Cur Lap Times: " +
                         str(info['curLapTime']) + " lastLaptime: " +
                         str(lastLapTime) + "\n ave sp: " + str(np.mean(sp)) +
                         " max sp: " + str(np.max(sp)))

            #logging.info(" Lambda Mix: " + str(self.lambda_mix))

            self.save['total_reward'].append(total_reward)
            self.save['total_step'].append(j_iter + 1)
            self.save['ave_reward'].append(total_reward / (j_iter + 1))

            self.save['distRaced'].append(info['distRaced'])
            self.save['distFromStart'].append(info['distFromStart'])

            self.save['lastLapTime'].append(info['lastLapTime'])
            self.save['curLapTime'].append(info['curLapTime'])
            self.save['lapTimes'].append(lastLapTime)
            if lastLapTime == []:
                self.save['avelapTime'].append(0)
            else:
                self.save['avelapTime'].append(np.mean(lastLapTime))

            self.save['ave_sp'].append(np.mean(sp))
            self.save['max_sp'].append(np.max(sp))
            self.save['min_sp'].append(np.min(sp))

            # test
            if np.mod(i_episode + 1, 10) == 0:
                logging.info("Start Testing!")
                test_total_reward, test_step, test_info, test_ave_sp, test_max_sp, test_min_sp, test_lastLapTime = self.rollout(
                    env)
                self.save['test_total_reward'].append(test_total_reward)
                self.save['test_total_step'].append(test_step)
                self.save['test_ave_reward'].append(test_total_reward /
                                                    test_step)

                self.save['test_distRaced'].append(test_info['distRaced'])
                self.save['test_distFromStart'].append(
                    test_info['distFromStart'])

                self.save['test_lastLapTime'].append(test_info['lastLapTime'])
                self.save['test_curLapTime'].append(test_info['curLapTime'])
                self.save['test_lapTimes'].append(test_lastLapTime)

                if test_lastLapTime == []:
                    self.save['test_avelapTime'].append(0)
                else:
                    self.save['test_avelapTime'].append(
                        np.mean(test_lastLapTime))

                self.save['test_ave_sp'].append(test_ave_sp)
                self.save['test_max_sp'].append(test_max_sp)
                self.save['test_min_sp'].append(test_min_sp)

            if np.mod(i_episode + 1, 5) == 0:
                print("Now we save model")
                #os.remove("actormodel.h5")
                self.actor.model.save_weights("actormodel_" + str(seed) +
                                              ".h5",
                                              overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(self.actor.model.to_json(), outfile)

                #os.remove("criticmodel.h5")
                self.critic.model.save_weights("criticmodel_" + str(seed) +
                                               ".h5",
                                               overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(self.critic.model.to_json(), outfile)

                filename = "./model/actormodel_" + str(seed) + '_' + str(
                    i_episode + 1) + ".h5"
                dirname = os.path.dirname(filename)
                if not os.path.exists(dirname):
                    os.makedirs(dirname)
                self.actor.model.save_weights(filename, overwrite=True)
                filename = "./model/criticmodel_" + str(seed) + '_' + str(
                    i_episode + 1) + ".h5"
                dirname = os.path.dirname(filename)
                if not os.path.exists(dirname):
                    os.makedirs(dirname)
                self.critic.model.save_weights(filename, overwrite=True)

            if np.mod(i_episode + 1, 10) == 0:
                filename = "./Fig/iprl_save_" + str(seed)
                dirname = os.path.dirname(filename)
                if not os.path.exists(dirname):
                    os.makedirs(dirname)
                with open(filename, 'wb') as f:
                    pickle.dump(self.save, f)

            if i_episode > 1000 and all(
                    np.array(self.save['total_reward'][-20:]) < 20):
                print('model degenerated. Stop at Epsisode ' + str(i_episode))
                break

        env.end()  # This is for shutting down TORCS
        logging.info("Neural Policy Update Finish.")
        return None

    def collect_data(self, controllers, tree=False):

        vision = False

        max_steps = 10000

        step = 0

        if not tree:
            steer_prog, accel_prog, brake_prog = controllers

        # Generate a Torcs environment
        env = TorcsEnv(vision=vision,
                       throttle=True,
                       gear_change=False,
                       track_name=self.track_name)
        ob = env.reset(relaunch=True)
        print("S0=", ob)

        window = 5
        lambda_store = np.zeros((max_steps, 1))
        lambda_max = 40.
        factor = 0.8

        logging.info("TORCS Collection started with Lambda = " +
                     str(self.lambda_mix))

        s_t = np.hstack((ob.speedX, ob.angle, ob.trackPos, ob.speedY,
                         ob.speedZ, ob.rpm, ob.wheelSpinVel / 100.0, ob.track))

        total_reward = 0.
        tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY],
                   [ob.speedZ], [ob.rpm],
                   list(ob.wheelSpinVel / 100.0),
                   list(ob.track), [0, 0, 0]]
        window_list = [tempObs[:] for _ in range(window)]

        observation_list = []
        actions_list = []

        lastLapTime = []
        sp = []

        for j_iter in range(max_steps):
            if tree:
                tree_obs = [sensor for obs in tempObs[:-1] for sensor in obs]
                act_tree = controllers.predict([tree_obs])
                steer_action = clip_to_range(act_tree[0][0], -1, 1)
                accel_action = clip_to_range(act_tree[0][1], 0, 1)
                brake_action = clip_to_range(act_tree[0][2], 0, 1)
            else:
                steer_action = clip_to_range(
                    steer_prog.pid_execute(window_list), -1, 1)
                accel_action = clip_to_range(
                    accel_prog.pid_execute(window_list), 0, 1)
                brake_action = clip_to_range(
                    brake_prog.pid_execute(window_list), 0, 1)

            action_prior = [steer_action, accel_action, brake_action]

            tempObs = [[ob.speedX], [ob.angle], [ob.trackPos], [ob.speedY],
                       [ob.speedZ], [ob.rpm],
                       list(ob.wheelSpinVel / 100.0),
                       list(ob.track), action_prior]
            window_list.pop(0)
            window_list.append(tempObs[:])

            a_t = self.actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            mixed_act = [
                a_t[0][k_iter] / (1 + self.lambda_mix) +
                (self.lambda_mix /
                 (1 + self.lambda_mix)) * action_prior[k_iter]
                for k_iter in range(3)
            ]
            if tree:
                newobs = [item for sublist in tempObs[:-1] for item in sublist]
                observation_list.append(newobs[:])
            else:
                observation_list.append(window_list[:])
            actions_list.append(mixed_act[:])
            ob, r_t, done, info = env.step(mixed_act)

            sp.append(info['speed'])

            if lastLapTime == []:
                if info['lastLapTime'] > 0:
                    lastLapTime.append(info['lastLapTime'])
            elif info['lastLapTime'] > 0 and lastLapTime[-1] != info[
                    'lastLapTime']:
                lastLapTime.append(info['lastLapTime'])

            s_t1 = np.hstack(
                (ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ,
                 ob.rpm, ob.wheelSpinVel / 100.0, ob.track))

            total_reward += r_t
            s_t = s_t1
            #if np.mod(step, 2000) == 0:
            #    logging.info(" Distance " + str(ob.distRaced) + " Lap Times " + str(ob.lastLapTime))

            step += 1
            if done:
                break

        logging.info("Data Collection Finished!")
        logging.info('Episode ends! \n' + "Episode Reward: " +
                     str(total_reward) + " Episode Length: " +
                     str(j_iter + 1) + " Ave Reward: " + str(total_reward /
                                                             (j_iter + 1)) +
                     "\n Distance: " + str(info['distRaced']) + ' ' +
                     str(info['distFromStart']) + "\n Last Lap Times: " +
                     str(info['lastLapTime']) + " Cur Lap Times: " +
                     str(info['curLapTime']) + " lastLaptime: " +
                     str(lastLapTime) + "\n ave sp: " + str(np.mean(sp)) +
                     " max sp: " + str(np.max(sp)))
        env.end()

        return observation_list, actions_list

    def label_data(self, controllers, observation_list, tree=False):
        if not tree:
            steer_prog, accel_prog, brake_prog = controllers
        actions_list = []
        net_obs_list = []
        logging.info("Data labelling started with Lambda = " +
                     str(self.lambda_mix))
        for window_list in observation_list:
            if tree:
                act_tree = controllers.predict([window_list])
                steer_action = clip_to_range(act_tree[0][0], -1, 1)
                accel_action = clip_to_range(act_tree[0][1], 0, 1)
                brake_action = clip_to_range(act_tree[0][2], 0, 1)
                net_obs_list.append(window_list)
            else:
                steer_action = clip_to_range(
                    steer_prog.pid_execute(window_list), -1, 1)
                accel_action = clip_to_range(
                    accel_prog.pid_execute(window_list), 0, 1)
                brake_action = clip_to_range(
                    brake_prog.pid_execute(window_list), 0, 1)
                net_obs = [sensor for obs in window_list[-1] for sensor in obs]
                net_obs_list.append(net_obs[:29])

            action_prior = [steer_action, accel_action, brake_action]

            s_t = np.hstack([[net_obs[:29]]])
            a_t = self.actor.model.predict(s_t.reshape(1, 29))
            mixed_act = [
                a_t[0][k_iter] / (1 + self.lambda_mix) +
                (self.lambda_mix /
                 (1 + self.lambda_mix)) * action_prior[k_iter]
                for k_iter in range(3)
            ]

            actions_list.append(mixed_act[:])

        return net_obs_list, observation_list, actions_list