Example #1
    def __init__(self, env_name, state_dim, action_dim):
        self.name = 'DDPG'  # name for uploading results
        self.env_name = env_name
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = state_dim
        self.action_dim = action_dim

        # Ensure action bound is symmetric
        self.time_step = 0
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.OU = OU()

        # loading networks
        self.saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state(save_location)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print("Successfully loaded:", checkpoint.model_checkpoint_path)
        else:
            print("Could not find old network weights")
Example #2
def actionAddNoise(a_t_original, train_indicator, epsilon, numCars=2):
    noise_t = np.zeros_like(a_t_original)
    a_t = np.zeros_like(a_t_original)
    for i in range(numCars):
        noise_t[i][0] = train_indicator * max(epsilon, 0) * OU.function(
            a_t_original[i][0], 0.0, 0.6, 0.2)
        noise_t[i][1] = train_indicator * max(epsilon, 0) * OU.function(
            a_t_original[i][1], 0.5, 1.0, 0.10)
        noise_t[i][2] = train_indicator * max(epsilon, 0) * OU.function(
            a_t_original[i][2], 0.3, 1.0, 0.05)

        a_t[i][0] = a_t_original[i][0] + noise_t[i][0]
        a_t[i][1] = a_t_original[i][1] + noise_t[i][1]
        a_t[i][2] = a_t_original[i][2] + noise_t[i][2]
    return a_t
Example #3
    def noise_action(self, state, epsilon):
        # return an action according to the current policy and exploration noise
        action = np.zeros([self.action_dim])
        noise = np.zeros([self.action_dim])

        action_pre = self.actor.predict([state])

        noise[0] = epsilon * OU.function(action_pre[0][0], 0.0, 0.80, 0.60)
        noise[1] = epsilon * OU.function(action_pre[0][1], 0.7, 1.00, 0.10)
        noise[2] = epsilon * OU.function(action_pre[0][2], -0.1, 1.00, 0.05)

        # ACTION: with noise
        action[0] = np.clip(action_pre[0][0] + noise[0], -1, 1)
        action[1] = np.clip(action_pre[0][1] + noise[1], 0, 1)
        action[2] = np.clip(action_pre[0][2] + noise[2], 0, 1)

        return action
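A self-contained sketch of the same pattern as noise_action above, using the assumed OU helper from Example #1 and a dummy policy output in place of self.actor.predict (the function and variable names here are illustrative only):

import numpy as np

def ou_noise(x, mu, theta, sigma):
    # Assumed stateless OU helper, as sketched under Example #1.
    return theta * (mu - x) + sigma * np.random.randn()

def noisy_action(action_pre, epsilon):
    # action_pre: [steering, throttle, brake] with the same ranges as above.
    noise = np.array([
        epsilon * ou_noise(action_pre[0], 0.0, 0.80, 0.60),
        epsilon * ou_noise(action_pre[1], 0.7, 1.00, 0.10),
        epsilon * ou_noise(action_pre[2], -0.1, 1.00, 0.05),
    ])
    # Clip each component to its valid range, as the method above does.
    return np.clip(action_pre + noise, [-1.0, 0.0, 0.0], [1.0, 1.0, 1.0])

print(noisy_action(np.array([0.1, 0.5, 0.0]), epsilon=1.0))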
Example #4
    def get_exploration_noiseV1(self, current_value, wheel_side):
        #print("Get noise")

        if wheel_side == 1:
            self.mu = self.working_point_left
        elif wheel_side == 2:
            self.mu = self.working_point_right


        return OU.function(current_value, self.mu, self.theta, self.sigma)
Example #5
 def __init__(self, nodes_num, type, capacity):
     self.nodes_num = nodes_num
     self.prev_traffic = None
     self.type = type
     self.capacity = capacity * nodes_num / (nodes_num - 1)
     self.dictionary = {}
     self.dictionary['NORM'] = self.normal_traffic
     self.dictionary['UNI'] = self.uniform_traffic
     self.dictionary['CONTROLLED'] = self.controlled_uniform_traffic
     self.dictionary['EXP'] = self.exp_traffic
     self.dictionary['OU'] = self.ou_traffic
     self.dictionary['STAT'] = self.stat_traffic
     self.dictionary['STATEQ'] = self.stat_eq_traffic
     self.dictionary['FILE'] = self.file_traffic
     self.dictionary['DIR'] = self.dir_traffic
     self.dictionary['STATIC'] = self.static_traffic
     if self.type.startswith('DIR:'):
         self.dir = sorted(listdir(self.type.split('DIR:')[-1]), key=lambda x: natural_key((x)))
     self.static = None
     self.total_ou = OU(1, self.capacity/2, 0.1, self.capacity/2)
     self.nodes_ou = OU(self.nodes_num**2, 1, 0.1, 1)
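The traffic generator above constructs OU(n, mu, theta, sigma) objects (self.total_ou, self.nodes_ou) rather than the stateless helper used elsewhere, so it presumably keeps a persistent, vector-valued process. A minimal sketch of such a class with a hypothetical sample() method (the real class and its method names are not shown here):

import numpy as np

class VectorOU(object):
    """Stateful, vectorized Ornstein-Uhlenbeck process (sketch)."""

    def __init__(self, size, mu, theta, sigma):
        self.size, self.mu, self.theta, self.sigma = size, mu, theta, sigma
        self.state = np.full(size, float(mu))

    def sample(self):
        # One Euler step of dx = theta*(mu - x)*dt + sigma*dW with dt = 1.
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(self.size)
        self.state = self.state + dx
        return self.state

ou = VectorOU(4, mu=1.0, theta=0.1, sigma=1.0)
print(ou.sample())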
Example #6
    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        self.epsilon_expert_range = (1.0, 0.1)
        self.epsilon_expert = self.epsilon_expert_range[0]
        self.epsilon_random_range = (0.1, 0.01)
        self.epsilon_random = self.epsilon_random_range[0]
        # Randomly initialize actor network and critic network
        # with both their target networks
        # self.state_dim = env.observation_space.shape[0]
        self.state_dim = 16
        # self.action_dim = env.action_space.shape[0]
        self.action_dim = 3
        self.time_step = 0
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        # self.exploration_noise = OUNoise(self.action_dim)
        # self.exploration_noise = OUNoise()
        self.OU = OU()
        # loading networks
        self.saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state(MODEL_PATH)
        if checkpoint and checkpoint.model_checkpoint_path:
            path = checkpoint.model_checkpoint_path
            self.saver.restore(self.sess, path)
            self.time_step = int(path[path.rindex('-') + 1:])
            self.epsilon_expert -= (
                self.epsilon_expert_range[0] -
                self.epsilon_expert_range[1]) * self.time_step / EXPLORE_COUNT
            self.epsilon_expert = max(self.epsilon_expert,
                                      self.epsilon_expert_range[1])
            self.epsilon_random -= (
                self.epsilon_random_range[0] -
                self.epsilon_random_range[1]) * self.time_step / EXPLORE_COUNT
            self.epsilon_random = max(self.epsilon_random,
                                      self.epsilon_random_range[1])
            logger.warn(
                "Successfully loaded: %s, step: %d, epsilon_expert: %s, epsilon_random: %s"
                % (path, self.time_step, self.epsilon_expert,
                   self.epsilon_random))
        else:
            logger.warn("Could not find old network weights")

        self.critic_cost = 0
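Example #6 re-derives both epsilon values from the restored time_step by annealing linearly over EXPLORE_COUNT steps and clamping at the lower bound. The same rule as a small standalone function (the function name is mine; the 100000 in the call is only an example value for EXPLORE_COUNT):

def annealed_epsilon(eps_range, time_step, explore_count):
    # Linearly anneal from eps_range[0] down to eps_range[1] over explore_count steps.
    hi, lo = eps_range
    eps = hi - (hi - lo) * time_step / float(explore_count)
    return max(eps, lo)

# e.g. the expert epsilon after 50000 of 100000 exploration steps
print(annealed_epsilon((1.0, 0.1), 50000, 100000))  # -> 0.55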
Example #7
    def __init__(self):
        self.OU = OU()

        self.total_correct = 0
        self.total_wrong = 0
        self.accuracy_all = []
        self.if_done = False
        self.epsilon = 1
        self.total_reward = None
        self.loss = None

        self.sim_inter = UpdateInter()
        self.state_t = []
        self.state_dim = self.sim_inter.state_dim
        self.action_t = []
        self.action_acc = None
        self.action_time = None
        self.Tau = self.sim_inter.Tau

        self.actor = None
        self.critic = None
        self.buff = None

        self.batch = None
        self.states = None
        self.actions = None
        self.rewards = None
        self.new_states = None
        self.if_dones = None
        self.y_t = None

        # Tensorflow GPU optimization
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        K.set_session(self.sess)
Example #8
def playGame(train_indicator=1):  #1 means Train, 0 means simply Run

    BUFFER_SIZE = 100000
    BATCH_SIZE = 30
    GAMMA = 0.99
    TAU = 0.0001  #Target Network HyperParameters
    LRA = 0.00001  #Learning rate for Actor
    LRC = 0.0001  #Learning rate for Critic

    action_dim = 1  #Steering/Acceleration/Brake
    state_dim = 15  #of sensors input

    np.random.seed(1337)
    vision = False

    EXPLORE = 1000000.
    episode_count = 3000
    max_steps = 1000000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0
    t_dt = 0.0005

    #TCP/IP communication for MATLAB - Python
    HOST = '0.0.0.0'
    PORT = 40000
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.setsockopt(socket.SOL_SOCKET, socket.SO_SNDBUF, 4096)
    s.bind((HOST, PORT))
    #Matlab client waiting
    s.listen(1)
    print("waiting for response from client at port ", PORT)
    conn, addr = s.accept()

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)

    buff = ReplayBuffer(BUFFER_SIZE)  #Create replay buffer

    #Now load the weight
    print("Now we load the weight")

    try:
        actor.model.load_weights("actormodel.h5")
        critic.model.load_weights("criticmodel.h5")
        actor.target_model.load_weights("actormodel.h5")
        critic.target_model.load_weights("criticmodel.h5")

        print("Weight load successfully")

    except:
        print("Cannot find the weight")

    print("TORCS Experiment Start.")

    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        total_reward = 0.

        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])
            Lateral = 0
            #Read CarSim export (input) variables to build s_t
            try:
                ob_exports = conn.recv(4096)
            except KeyboardInterrupt:
                #conn.shutdown()
                conn.close()
                break
            if not ob_exports:
                #conn.shutdown()
                conn.close()
                break
            ob_exports1 = json.loads(ob_exports.decode('utf-8'))
            print('export=', ob_exports1)
            t_current = ob_exports1[0]
            T_bar_Tq = ob_exports1[1] / 10
            LatG = ob_exports1[2]
            YawRate = ob_exports1[3] / 50
            Yaw = ob_exports1[4] / 3.14
            Lateral = ob_exports1[5] / 20
            Steer_SW = ob_exports1[6] / 6000
            StrAV_SW = ob_exports1[7] / 5000
            Steer_L1 = ob_exports1[8] / 180
            Steer_R1 = ob_exports1[9] / 180
            Steer_L2 = ob_exports1[10] / 4
            Steer_R2 = ob_exports1[11] / 4
            Xcg_TM = ob_exports1[12] / 1000
            Ycg_TM = ob_exports1[13] / 300
            Zcg_TM = ob_exports1[14] / 45
            curv = ob_exports1[15]
            #            print('T_bar_Tq=',T_bar_Tq)
            #            print('LatG=',LatG)

            s_t = np.hstack((T_bar_Tq, LatG, YawRate, Yaw, Lateral, Steer_SW,
                             StrAV_SW, Steer_L1, Steer_R1, Steer_L2, Steer_R2,
                             Xcg_TM, Ycg_TM, Zcg_TM, curv))
            print('s_t=', s_t)
            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            print('a_t_original=', a_t_original)
            a_t_inv = a_t_original[0][0]
            print(a_t_inv.shape)
            critic_gradient = critic.gradients(s_t.reshape(1, s_t.shape[0]),
                                               a_t_original)
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.00, 0.00)
            #            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][1],  0.5 , 1.00, 0.10)
            #            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], -0.1 , 1.00, 0.05)

            #The following code does the stochastic brake

            #if random.random() <= 0.1:
            #    print("********Now we apply the brake***********")
            #    noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2],  0.2 , 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            #            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            #            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]
            a_t[0][0] = a_t[0][0] * 3500

            t_current = t_current + t_dt
            print('t_next=', t_current)
            print(a_t[0])
            at = np.array(a_t[0])
            #            print("at=",at)
            at1 = np.insert(at, 0, t_current)
            #            print('at1=,',at1)
            at2 = list(at1)
            print('at2=,', at2)

            #provide action value to matlab
            try:
                at_json = json.dumps(at2)
                a = '\r\n'
                at_json1 = at_json + a
                #               print('at_json1',at_json1)
                at_json2 = at_json1.encode('utf-8')
                #               print('at_json2',at_json2)
                conn.sendall(at_json2)
            except KeyboardInterrupt:
                #conn.shutdown()
                conn.close()
                break

            #Read CarSim export (input) variables to build s_t1
            try:
                ob_exports = conn.recv(4096)
            except KeyboardInterrupt:
                #conn.shutdown()
                conn.close()
                break
            if not ob_exports:
                #conn.shutdown()
                conn.close()
                break
            ob_exports1 = json.loads(ob_exports.decode('utf-8'))
            print('s_t1=', ob_exports1)
            T_bar_Tq1 = ob_exports1[0] / 10
            LatG1 = ob_exports1[1]
            YawRate1 = ob_exports1[2] / 50
            Yaw1 = ob_exports1[3] / 3.14
            Lateral1 = ob_exports1[4] / 20
            Steer_SW1 = ob_exports1[5] / 6000
            StrAV_SW1 = ob_exports1[6] / 5000
            Steer_L11 = ob_exports1[7] / 180
            Steer_R11 = ob_exports1[8] / 180
            Steer_L21 = ob_exports1[9] / 4
            Steer_R21 = ob_exports1[10] / 4
            Xcg_TM1 = ob_exports1[11] / 1000
            Ycg_TM1 = ob_exports1[12] / 300
            Zcg_TM1 = ob_exports1[13] / 45
            curv = ob_exports1[14]
            r_t = ob_exports1[15]
            done = ob_exports1[16]
            #            print('T_bar_Tq1=',T_bar_Tq1)
            print('r_t=', r_t)

            #            if abs(Lateral1) > 1 or abs(Yaw1) > 1 :
            if t_current > 20 or abs(Yaw1) > 1:

                break

            s_t1 = np.hstack(
                (T_bar_Tq1, LatG1, YawRate1, Yaw1, Lateral1, Steer_SW1,
                 StrAV_SW1, Steer_L11, Steer_R11, Steer_L21, Steer_R21,
                 Xcg_TM1, Ycg_TM1, Zcg_TM1, curv))
            buff.add(s_t, a_t[0], r_t, s_t1, done)  #Add replay buffer

            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            #            print ("Rewards=",rewards)
            #            print ("Actions=",actions)
            #            print ("states=",states)
            #            print (states.shape)

            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])
            #            print("rt1=",target_q_values)
            #            print(target_q_values.shape)

            for k in range(len(batch)):

                if dones[k]:

                    y_t[k] = rewards[k]

                else:

                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train_indicator):

                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                #                print("a_for_grad=",a_for_grad)
                #                print(a_for_grad.shape)
                grads = critic.gradients(states, a_for_grad)
                #                print("grads=",grads)
                #                print(grads.shape)
                if step > 30:
                    grads_factor = gradient_inverter(critic_gradient,
                                                     a_t_inv,
                                                     p_min=-1,
                                                     p_max=1,
                                                     BATCH_SIZE=30)
                else:
                    grads_factor = 1
#                print("grads_factor=",grads_factor)
                grads_factor1 = np.asarray(grads_factor)
                grads3 = grads * grads_factor1
                #                print("grads3=",grads3)
                actor.train(states, grads3)
                actor.target_train()
                critic.target_train()

            total_reward += r_t

            s_t = s_t1

            print("Episode", i, "t_current", t_current, "Action", a_t,
                  "Reward", r_t, "Loss", loss, "step", step)

            step += 1

            if done:

                break
        #s.shutdown()

        if (train_indicator):

            print("Now we save model")
            actor.model.save_weights("actormodel.h5", overwrite=True)
            with open("actormodel.json", "w") as outfile:

                json.dump(actor.model.to_json(), outfile)

            critic.model.save_weights("criticmodel.h5", overwrite=True)
            with open("criticmodel.json", "w") as outfile:

                json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))

        print("")


#        s.close() # TCP/IP socket close

    s.close()  # TCP/IP socket close
    print("Finish.")
Example #9
def train(train_indicator=1):
    env = Env()

    BUFFER_SIZE = 200000
    BATCH_SIZE = 128
    GAMMA = 0.99
    TAU = 0.001  # Target Network HyperParameters
    LRA = 0.0001  # Learning rate for Actor
    LRC = 0.001  # Learning rate for Critic

    action_dim = env.action_dim
    state_dim = env.observation_space()

    np.random.seed(1337)

    EXPLORE = 100000.
    episode_count = 100
    max_steps = 10000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)

    print("load model weight")
    try:
        actor.model.load_weights("model/actormodel.h5")
        critic.model.load_weights("model/criticmodel.h5")
        actor.target_model.load_weights("model/actormodel.h5")
        critic.target_model.load_weights("model/criticmodel.h5")
        print("load successfully")
    except:
        print("Cannot find the model weight")

    s_t = env.reset()

    for i in range(episode_count):
        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        total_reward = 0.
        for j in range(max_steps):
            loss = 0

            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 10.0, 1, 7)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0, 1, 3)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]

            s_t1, r_t, _ = env.step(a_t[0])

            buff.add(s_t, a_t[0], r_t, s_t1, done)

            # env.get_memory(buff)

            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if train_indicator:
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t,
                  "Loss", loss)

            step += 1
            if done:
                break

        if np.mod(i, 3) == 0:
            if train_indicator:
                print("save model model")
                actor.model.save_weights("model/actormodel.h5", overwrite=True)
                # actor.model.save_weights("model/actormodel.h5", overwrite=True)
                # with open("model/actormodel.json", "wb") as outfile:
                #     json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("model/criticmodel.h5",
                                          overwrite=True)
                # critic.model.save_weights("model/criticmodel.h5", overwrite=True)
                # with open("model/criticmodel.json", "wb") as outfile:
                #     json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD " + str(i) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")

    print("Finish.")
    return actor
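The inner loop above (and in most examples on this page) builds the DDPG bootstrap target y_k = r_k + GAMMA * Q_target(s'_k, mu_target(s'_k)), falling back to y_k = r_k on terminal transitions. The same computation vectorized in NumPy, with dummy arrays standing in for the batch and the critic's target output:

import numpy as np

GAMMA = 0.99
rewards = np.array([1.0, 0.5, -0.2])
dones = np.array([False, False, True])
target_q_values = np.array([[2.0], [1.5], [0.7]])  # stand-in for critic.target_model.predict(...)

# Terminal transitions keep only the reward; the rest add the discounted target Q.
y_t = rewards + GAMMA * target_q_values[:, 0] * (1.0 - dones.astype(float))
print(y_t)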
Example #10
def playGame(train_indicator=1):  # 1 means Train, 0 means simply Run

    cur_path = os.path.abspath(os.path.curdir)
    model_path = "/Models/"
    result_path = "/Results/"
    curr_test = "Large_Noise_Result/"
    actor_name = "actormodel{}.h5"
    critic_name = "criticmodel{}.h5"

    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001  # Target Network HyperParameters
    LRA = 1e-4  # Learning rate for Actor
    LRC = 1e-3  # Learning rate for Critic
    action_dim = 4  # Steering/Acceleration/Brake
    state_dim = 131  # of sensors input

    np.random.seed(2333)

    EXPLORE = 10000
    episode_count = 10000
    max_steps = 100000
    reward = 0
    done = 0
    step = 0
    epsilon = 1

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    K.set_session(sess)

    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)

    buff = Buffer(BUFFER_SIZE)  # Create replay buffer

    # Generate a simulator environment
    env = Simulator()

    # Now load the weight
    print("Now we load the weight")
    try:
        actor.model.load_weights(cur_path + "/Models/actormodel.h5")
        critic.model.load_weights(cur_path + "/Models/criticmodel.h5")
        actor.target_model.load_weights(cur_path + "/Models/actormodel.h5")
        critic.target_model.load_weights(cur_path + "/Models/criticmodel.h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    for i in range(episode_count):
        start_time = time.time()

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))
        if i % 1000 == 0:
            losses = np.zeros((1000, ))
            total_rewards = np.zeros((1000, ))

        s_t = env.reset()

        total_reward = 0
        loss = 0
        for j in range(max_steps):
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t)
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.5, 1.00, 0.15)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.15)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], 0.5, 1.00, 0.15)
            noise_t[0][3] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][3], 0.5, 1.00, 0.15)

            # The following code does the stochastic brake
            # if random.random() <= 0.1:
            #    print("********Now we apply the brake***********")
            #    noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2],  0.2 , 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]
            a_t[0][3] = a_t_original[0][3] + noise_t[0][3]

            a_t = np.around(a_t, decimals=1)

            s_t1, r_t, done = env.step(a_t)

            buff.add(s_t, a_t, r_t, np.array([[done]]),
                     s_t1)  # Add replay buffer

            # Do the batch update

            batch = buff.getBatch(BATCH_SIZE)
            states = batch[:, :state_dim]
            actions = batch[:, state_dim:state_dim + action_dim]
            rewards = batch[:, state_dim + action_dim]
            new_states = batch[:, state_dim + action_dim + 2:]
            dones = batch[:, state_dim + action_dim + 1]
            y_t = actions.copy()

            target_q_values = critic.target_model.predict([
                new_states,
                np.around(actor.target_model.predict(new_states), decimals=1)
            ])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train_indicator):
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = np.around(actor.model.predict(states), decimals=1)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t,
                  "Loss", loss)

            step += 1
            if done:
                break

        losses[i % 1000] = loss
        total_rewards[i % 1000] = total_reward

        if np.mod((i + 1), 100) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights(cur_path + "/Models/actormodel.h5",
                                         overwrite=True)
                critic.model.save_weights(cur_path + "/Models/criticmodel.h5",
                                          overwrite=True)

        if np.mod((i + 1), 1000) == 0:
            if (train_indicator):
                losses_path = (cur_path + result_path + curr_test +
                               'losses{}.txt').format(i)
                rewards_path = (cur_path + result_path + curr_test +
                                'rewards{}.txt').format(i)
                np.savetxt(losses_path, losses)
                np.savetxt(rewards_path, total_rewards)
                print("Now we save model")
                actor.model.save_weights((cur_path + model_path + curr_test +
                                          "actormodel{}.h5").format(i),
                                         overwrite=True)
                critic.model.save_weights((cur_path + model_path + curr_test +
                                           "criticmodel{}.h5").format(i),
                                          overwrite=True)
                actor.target_model.save_weights(
                    (cur_path + model_path + curr_test +
                     "actortarmodel{}.h5").format(i),
                    overwrite=True)
                critic.target_model.save_weights(
                    (cur_path + model_path + curr_test +
                     "crititarcmodel{}.h5").format(i),
                    overwrite=True)

        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("Took {} S".format(time.time() - start_time))

    # This is for shutting down TORCS
    print("Finish.")
Example #11
def playGame(train_indicator=0):  # 1 means Train, 0 means simply Run
    BUFFER_SIZE = 1000000
    # BUFFER_SIZE1 = 50000
    # BUFFER_SIZE2 = 5000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001  # Target Network HyperParameters
    LRA = 0.0001  # Learning rate for Actor
    LRC = 0.001  # Learning rate for Critic

    action_dim = 2  # Acceleration/LaneChanging
    state_dim = 26  # of sensors input

    np.random.seed(1337)

    EXPLORE = 1000000
    episode_count = 2018
    max_steps = 5299
    done = 0
    step = 0
    epsilon = 1

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)
    # buff0 = ReplayBuffer(BUFFER_SIZE0)  # Create replay buffer
    # buff1 = ReplayBuffer(BUFFER_SIZE1)
    # buff2 = ReplayBuffer(BUFFER_SIZE2)
    # Now load the weight
    print("Now we load the weight")
    try:
        # actor.model.load_weights("train_actor_lanechanging.h5")
        actor.model.load_weights("actormodel.h5")
        actor.target_model.load_weights("actor_target_model.h5")
        print("actor Weight load successfully")
    except:
        print("Cannot find the actor weight")

    try:
        critic.model.load_weights("criticmodel.h5")
        critic.target_model.load_weights("critic_target_model.h5")
        print("critic Weight load successfully")
    except:
        print("Cannot find the critic weight")

    HOST = '127.0.0.1'
    PORT = 5099
    BUFSIZ = 1024
    ADDR = (HOST, PORT)
    socketserver.TCPServer.allow_reuse_address = True
    tcpSerSock = socket(AF_INET, SOCK_STREAM)
    tcpSerSock.setsockopt(SOL_SOCKET, SO_REUSEADDR, 1)
    tcpSerSock.bind(ADDR)
    tcpSerSock.listen(5)

    # while True:
    print('waiting for connection...')
    tcpCliSock, addr = tcpSerSock.accept()
    print('...connected from:', addr)

    # save Reward file
    with open("r_l_q_everyeposide.txt", "w") as f:
        print("Vissim Experiment Start.")
        for i in range(episode_count):
            display = []
            print("Episode : " + str(i) + " Replay Buffer " + str(buff.num_experiences))

            data0 = tcpCliSock.recv(BUFSIZ)
            (Vx0, Vy0, Dl0, Dr0, Vx2_diff0, Dx2_diff0, Vy2_diff0, Dy2_diff0,
             Vx1_diff0, Dx1_diff0, Vy1_diff0, Dy1_diff0, Vx3_diff0, Dx3_diff0,
             Vy3_diff0, Dy3_diff0, Vx6_diff0, Dx6_diff0, Vy6_diff0, Dy6_diff0,
             Vx4_diff0, Dx4_diff0, Vy4_diff0, Dy4_diff0, Vx5_diff0, Dx5_diff0,
             Vy5_diff0, Dy5_diff0, done0, aux0) = struct.unpack("30d", data0)
            raw_obs0 = [Vx0, Vy0, Dl0, Dr0, Vx2_diff0, Dx2_diff0, Vy2_diff0, Dy2_diff0, Vx1_diff0, Dx1_diff0, Vy1_diff0,
                        Dy1_diff0, Vx3_diff0, Dx3_diff0, Vy3_diff0, Dy3_diff0, Vx6_diff0, Dx6_diff0, Vy6_diff0,
                        Dy6_diff0, Vx4_diff0, Dx4_diff0, Vy4_diff0, Dy4_diff0, Vx5_diff0, Dx5_diff0, Vy5_diff0,
                        Dy5_diff0]
            print('raw_obs0=', raw_obs0)

            # Generate a Vissim environment
            env = VissimEnv(raw_obs0)

            s_t = env.make_observaton(raw_obs0)

            total_loss = 0
            total_reward_cf = 0
            total_reward_lc = 0
            total_q_value = 0

            Dx2_diff = Dx2_diff0
            Vx = Vx0
            Vx2_diff = Vx2_diff0
            for j in range(max_steps):
                loss = 0
                epsilon -= 1.0 / EXPLORE
                a_t = np.zeros([1, action_dim])
                noise_t = np.zeros([1, action_dim])

                a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))

                noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][0], 0.0, 0.60, 0.30)
                noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][1], 0.02, 1.00, 0.10)

                a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
                a_t[0][1] = a_t_original[0][1] + noise_t[0][1]

                if a_t[0][1] > 0:
                    acceleration = a_t[0][1] * 3.5
                else:
                    acceleration = a_t[0][1] * 8

                r_t_first = 0
                # if Dx2_diff < 2*Vx + 4.25:
                # if acceleration > 0:
                # r_t_first = -1
                # acceleration = -acceleration
                if Dx2_diff < 2 * Vx + 4.25:
                    # if Vx2_diff > 0:
                    if acceleration < -abs(Vx2_diff) / 2:
                        pass
                    else:
                        acceleration = -abs(Vx2_diff) / 2
                        r_t_first = -1

                if 0 <= a_t[0][0] and a_t[0][0] <= 0.1739523314093953:
                    LaneChanging = 1
                elif a_t[0][0] > 0.1739523314093953 and a_t[0][0] <= 1 - 0.1739523314093953:
                    LaneChanging = 0
                else:
                    LaneChanging = 2

                # 1 represent left lane changing
                # 0 represent no lane changing
                # 2 represent right lane changing

                ACTION = [LaneChanging, acceleration]

                print("acceleration=", acceleration)
                print("LaneChanging=", LaneChanging)
                # while True:

                tcpCliSock.send(str(ACTION).encode())

                data = tcpCliSock.recv(BUFSIZ)
                # print(data)

                (Vx, Vy, Dl, Dr, Vx2_diff, Dx2_diff, Vy2_diff, Dy2_diff,
                 Vx1_diff, Dx1_diff, Vy1_diff, Dy1_diff, Vx3_diff, Dx3_diff,
                 Vy3_diff, Dy3_diff, Vx6_diff, Dx6_diff, Vy6_diff, Dy6_diff,
                 Vx4_diff, Dx4_diff, Vy4_diff, Dy4_diff, Vx5_diff, Dx5_diff,
                 Vy5_diff, Dy5_diff, done, aux) = struct.unpack("30d", data)
                raw_obs = [Vx, Vy, Dl, Dr, Vx2_diff, Dx2_diff, Vy2_diff, Dy2_diff, Vx1_diff, Dx1_diff, Vy1_diff,
                           Dy1_diff, Vx3_diff, Dx3_diff, Vy3_diff, Dy3_diff, Vx6_diff, Dx6_diff, Vy6_diff, Dy6_diff,
                           Vx4_diff, Dx4_diff, Vy4_diff, Dy4_diff, Vx5_diff, Dx5_diff, Vy5_diff, Dy5_diff]

                print('vel=', Vx)
                print('vel_diff=', Vx2_diff)
                print('d=', Dx2_diff)
                print('done=', done)

                if raw_obs == []:
                    print('No data')
                    break

                if LaneChanging == 1 or LaneChanging == 2:
                    r_t_lanechange = aux
                    if aux == -0.8:
                        if r_t_first == 0:
                            r_t_follow = env.step(acceleration, raw_obs)
                        else:
                            r_t_follow = r_t_first
                    else:
                        r_t_follow = 0
                elif LaneChanging == 0:
                    if r_t_first == 0:
                        r_t_follow = env.step(acceleration, raw_obs)
                    else:
                        r_t_follow = r_t_first
                    r_t_lanechange = 0

                if i == 0 and j == 0:
                    r_t_lanechange, r_t_follow = 0, 0

                print('r_t_follow=', r_t_follow, 'r_t_lanechange=', r_t_lanechange)

                # save some variables for display
                display.append([i, j, Vx, Vx2_diff, r_t_follow + r_t_lanechange])

                r_t = [r_t_follow, r_t_lanechange]

                s_t1 = env.make_observaton(raw_obs)

                q_value = critic.model.predict_on_batch(
                    [np.array(s_t).reshape(1, 26), np.array(a_t_original).reshape(1, 2)])
                target_q_value = critic.target_model.predict_on_batch(
                    [np.array(s_t).reshape(1, 26), np.array(a_t_original).reshape(1, 2)])
                # f.write("Episode" + str(i) + " " + "Step" + str(j) + " " + "Action=" + str(ACTION) + " " + "aIDM=" + str(aIDM) + "\n")
                error = abs(r_t + GAMMA * target_q_value - q_value)
                error = np.mean(error)
                # Add replay buffer
                buff.add(s_t, a_t[0], r_t, s_t1, done)

                batch = buff.getBatch(BATCH_SIZE)
                states = np.asarray([e[0] for e in batch])
                actions = np.asarray([e[1] for e in batch])
                rewards = np.asarray([e[2] for e in batch])
                new_states = np.asarray([e[3] for e in batch])
                dones = np.asarray([e[4] for e in batch])
                y_t = np.asarray([e[2] for e in batch])

                target_q_values = critic.target_model.predict([new_states, actor.target_model.predict([new_states])])

                for k in range(len(batch)):
                    if dones[k]:
                        y_t[k] = rewards[k]
                    else:
                        y_t[k] = rewards[k] + GAMMA * target_q_values[k]

                if (train_indicator):
                    loss += critic.model.train_on_batch([states, actions], y_t)
                    a_for_grad = actor.model.predict(states)
                    grads = critic.gradients(states, a_for_grad)
                    actor.train(states, grads)
                    actor.target_train()
                    critic.target_train()

                total_reward_cf += r_t_follow
                total_reward_lc += r_t_lanechange
                total_loss += loss
                total_q_value += q_value

                s_t = s_t1

                print("Episode", i, "Step", j, "Total Step", step, "acceleration=", acceleration, "LaneChanging=",
                      LaneChanging, "Reward", r_t, "Loss", loss)

                step += 1

                if done == 1:
                    break
            display = np.array(display)
            np.savetxt('epi'+str(i)+'.txt', display)

            if np.mod(i, 5) == 0:
                if (train_indicator):
                    print("Now we save model")
                    actor.model.save_weights("actormodel.h5", overwrite=True)
                    with open("actormodel.json", "w") as outfile:
                        json.dump(actor.model.to_json(), outfile)

                    critic.model.save_weights("criticmodel.h5", overwrite=True)
                    with open("criticmodel.json", "w") as outfile:
                        json.dump(critic.model.to_json(), outfile)

                    critic.target_model.save_weights("critic_target_model.h5", overwrite=True)
                    with open("critic_target_model.json", "w") as outfile:
                        json.dump(critic.target_model.to_json(), outfile)

                    actor.model.save_weights("actor_target_model.h5", overwrite=True)
                    with open("actor_target_model.json", "w") as outfile:
                        json.dump(actor.target_model.to_json(), outfile)

            ave_loss = total_loss / (j + 1)
            ave_q = total_q_value / (j + 1)

            f.write("Episode" + str(i) + " " + "TotalReward_follow=" + str(
                total_reward_cf) + " " + "TotalReward_lanechange=" + str(total_reward_lc) + " " + "AverageLoss=" + str(
                ave_loss) + " " + "AverageValue=" + str(ave_q) + "\n")

        print("TOTAL REWARD @ " + str(j) + "/" + str(i) + "-th Episode  : Reward_follow " + str(
            total_reward_cf) + "Reward_follow :" + str(total_reward_lc))
        print("Total Step: " + str(step))
        print("")

        tcpCliSock.close()
        tcpSerSock.close()
        # env.end()  # This is for shutting down TORCS
        print("Finish.")
Example #12
#!/usr/bin/env python
# coding=utf-8
import numpy as np
import tensorflow as tf
from OU import OU
from ReplayBuffer import ReplayBuffer

from ActorNetwork import ActorNetwork
from CriticNetwork import CriticNetwork

from env_step import Env

OU = OU()


def train(train_indicator=1):
    env = Env()

    BUFFER_SIZE = 200000
    BATCH_SIZE = 128
    GAMMA = 0.99
    TAU = 0.001  # Target Network HyperParameters
    LRA = 0.0001  # Learning rate for Actor
    LRC = 0.001  # Learning rate for Critic

    action_dim = env.action_dim
    state_dim = env.observation_space()

    np.random.seed(1337)

    EXPLORE = 100000.
Example #13
def playGame(train_indicator=1):  # 1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001  # Target Network HyperParameters
    LRA = 0.0001  # Learning rate for Actor
    LRC = 0.001  # Learning rate for Critic

    action_dim = 3  # Steering/Acceleration/Brake
    state_dim = 29  # of sensors input

    np.random.seed(61502)

    base_dir = "/home/sergio/Projects/apclypsr/DDPG-Keras-Torcs/"

    vision = True

    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 10000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0
    esar2 = []
    esar4 = []

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    sess = tf.Session(config=config)

    #tf.set_random_seed(61502)

    actor = ActorNetwork(sess, state_dim, action_dim,
                         LRA, TAU, BATCH_SIZE)

    critic = CriticNetwork(sess, state_dim, action_dim,
                           LRC, TAU, GAMMA,
                           actor.get_num_trainable_vars())





    #actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    #critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    # Now load the weight

    restore = False
    if restore:
        print("Now we load the weight")
        # tf.reset_default_graph()

        # Tensorflow saver
        saver = tf.train.Saver()
        try:
            saver.restore(sess, base_dir + "ddpg.ckpt")
            print("model restored")
        except:
            print("Cannot find the weight")
    else:
        print("No weight loaded")
        init = tf.global_variables_initializer()
        sess.run(init)
        # Tensorflow saver
        saver = tf.train.Saver()

    print("TORCS Experiment Start.")
    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 500) == 0:
            ob = env.reset(relaunch=True)  # relaunch TORCS every 500 episodes because of the memory leak error
        else:
            ob = env.reset()

        # 0. BUILD THE 4 images.
        s_t = np.hstack((ob.img))
        s_t_four_images_list = []
        for j in range(4):
            s_t_four_images_list.append(np.zeros((128, 128), dtype=np.float64))
        s_t_phi = get_phi_from_four_images(s_t_four_images_list)


        ep_ave_max_q = 0
        total_reward = 0.
        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])

            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.predict(s_t_phi)
            # print("a_t_original")
            print(a_t_original)
            # print(a_t_original.shape)
            # print(a_t_original[0,1])
            # print(a_t_original[0][1])

            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], -0.1, 1.00, 0.05)

            # The following code does the stochastic brake
            # if random.random() <= 0.05:
            #    print("********Now we apply the brake***********")
            #    noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2],  0.2 , 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            # 0. UPDATE THE LAST FOUR IMAGES
            s_t1 = np.hstack((ob.img))
            if len(s_t_four_images_list) >= 4:
                s_t_four_images_list.pop(0)
                image = np.reshape(ob.img, (128, 128))
                s_t_four_images_list.append(image)

                # print greyscale image
                # plt.imshow(image, origin='lower')
                # plt.draw()
                # plt.pause(0.001)
            # get phi for the new observed state
            s_t1_phi = get_phi_from_four_images(s_t_four_images_list)

            buff.add(s_t_phi, a_t[0], r_t, s_t1_phi, done)  # Add replay buffer

            # Do the batch update
            if buff.size() > BATCH_SIZE:
                batch = buff.getBatch(BATCH_SIZE)
                states = np.asarray([e[0] for e in batch])
                states = np.concatenate(states, axis=0)
                actions = np.asarray([e[1] for e in batch])
                rewards = np.asarray([e[2] for e in batch])
                new_states = np.asarray([e[3] for e in batch])
                new_states = np.concatenate(new_states, axis=0)
                dones = np.asarray([e[4] for e in batch])
                y_t = np.asarray([e[1] for e in batch])

                actor_predicted_actions = actor.predict_target(new_states)
                #print("Actor predicted actions: ", actor_predicted_actions.shape)
                #print("New states: ", new_states.shape)

                target_q_values = critic.predict_target(new_states, actor_predicted_actions)

                for k in range(len(batch)):
                    if dones[k]:
                        y_t[k] = rewards[k]
                    else:
                        y_t[k] = rewards[k] + GAMMA * target_q_values[k]

                if (train_indicator):
                    # loss += critic.model.train_on_batch([states, actions], y_t)
                    # a_for_grad = actor.model.predict(states)
                    # grads = critic.gradients(states, a_for_grad)
                    # actor.train(states, grads)
                    # actor.target_train()
                    # critic.target_train()

                    # Update the critic given the targets

                    print("y_t")
                    print(y_t.shape)

                    predicted_q_value, _, loss, loss2 = critic.train(states, actions, y_t)

                    print("LOSS:", loss)
                    print("LOSS2:", loss2)

                    ep_ave_max_q += np.amax(predicted_q_value)

                    # Update the actor policy using the sampled gradient
                    a_outs = actor.predict(states)
                    grads = critic.action_gradients(states, a_outs)
                    actor.train(states, grads[0])

                    # Update target networks
                    actor.update_target_network()
                    critic.update_target_network()

                #batch update

            #step end
            total_reward += r_t
            s_t = s_t1

            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)
            esar = (i, step, a_t, r_t, loss)
            esar2.append(esar)

            step += 1
            if done:
                break

        if np.mod(i, 3) == 0:
            if (train_indicator):
                # print("Now we save model")
                # actor.model.save_weights("actormodel2.h5", overwrite=True)
                # with open("actormodel.json", "w") as outfile:
                #     json.dump(actor.model.to_json(), outfile)
                #
                # critic.model.save_weights("criticmodel2.h5", overwrite=True)
                # with open("criticmodel.json", "w") as outfile:
                #     json.dump(critic.model.to_json(), outfile)
                save_path = saver.save(sess, base_dir + "ddpg.ckpt")
                print("Model saved in file: %s" % save_path)

        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")

        esar3 = (i, step, total_reward)
        esar4.append(esar3)

        def save_object(obj, filename):
            with open(filename, 'wb') as output:
                pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)

        save_object(esar2, 'IntraEpisode.pkl')
        save_object(esar4, 'InterEpisode.pkl')



    env.end()  # This is for shutting down TORCS
    print("Finish.")
    print("Saving esars.")
Example #14
import argparse
import gym
from gym import wrappers
from keras.models import model_from_json, Model
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.optimizers import Adam
import tensorflow as tf
#from keras.engine.training import collect_trainable_weights
from keras import backend as K
from mmstore import MMStore
from mujoco_actor_nn import ActorNetwork
from mujoco_critic_nn import CriticNetwork
from OU import OU
import time

noise_func = OU()
MAX_INTERACTION = 10000000
MAX_EPI_INT = 500
MEM_SIZE = 500
BATCH_SIZE = 64
gamma = 0.99
SOFT_UPDATE = 1e-3
ALR = 1e-4    
CLR = 1e-3
REPEAT = 10


env = gym.make('HalfCheetah-v1')
eval_state = env.reset()
output_shape=env.action_space.shape
input_shape=env.observation_space.shape     
Example #15
from gym_torcs import TorcsEnv
import numpy as np
import random
import argparse
import tensorflow as tf
import json

from ReplayBuffer import ReplayBuffer
from ActorNetwork import ActorNetwork
from CriticNetwork import CriticNetwork
from OU import OU
import timeit

OU = OU()       #Ornstein-Uhlenbeck Process

def playGame(train_indicator=1):    #1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001     #Target Network HyperParameters
    LRA = 0.00005    #Learning rate for Actor
    LRC = 0.0005     #Learning rate for Critic

    action_dim = 3  #Steering/Acceleration/Brake
    state_dim = 29  #of sensors input

    np.random.seed(1337)

    vision = False

    EXPLORE = 200000.
Example #16
def playGame(train_indicator=1):    #1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001     #Target Network HyperParameters
    LRA = 0.0001    #Learning rate for Actor
    LRC = 0.001     #Learning rate for Critic

    action_dim = 3  #Steering/Acceleration/Brake
    state_dim = 29  #of sensors input

    np.random.seed(1337)

    vision = False

    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)    #Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True,gear_change=False)

    #Now load the weight
    print("Now we load the weight")
    try:
        actor.model.load_weights("actormodel.h5")
        critic.model.load_weights("criticmodel.h5")
        actor.target_model.load_weights("actormodel.h5")
        critic.target_model.load_weights("criticmodel.h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    print("TORCS Experiment Start.")
    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            ob = env.reset(relaunch=True)   #relaunch TORCS every 3 episodes because of the memory leak error
        else:
            ob = env.reset()

        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,  ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
     
        total_reward = 0.
        for j in range(max_steps):
            loss = 0 
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1,action_dim])
            noise_t = np.zeros([1,action_dim])
            
            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][0],  0.0 , 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][1],  0.5 , 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], -0.1 , 1.00, 0.05)

            #The following code does the stochastic brake
            #if random.random() <= 0.1:
            #    print("********Now we apply the brake***********")
            #    noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2],  0.2 , 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
        
            buff.add(s_t, a_t[0], r_t, s_t1, done)      #Add replay buffer
            
            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict([new_states, actor.target_model.predict(new_states)])  
           
            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA*target_q_values[k]
       
            if (train_indicator):
                loss += critic.model.train_on_batch([states,actions], y_t) 
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            if np.mod(step, 30) == 0:
                print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)
        
            step += 1
            if done:
                break

        if np.mod(i, 3) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("actormodel.h5", overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("criticmodel.h5", overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) +"-th Episode  : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
Beispiel #17
0
    vnc_kwargs={
        'encoding': 'tight',
        'fine_quality_level': 0,
        'subsample_level': 3,
        'quality_level': 0,
    },
)

# show VNC window?
render = False

### SET UP AGENT

train_indicator = 1  # 1 if training

OU = OU()  # Ornstein-Uhlenbeck Process
BUFFER_SIZE = 100000
BATCH_SIZE = 32
GAMMA = 0.99
TAU = 0.1  # Target Network HyperParameter
LRA = 0.0001  # Learning rate for Actor
LRC = 0.001  # Learning rate for Critic

av_pos = -10  # todo
av_xpos = 2  # todo
av_angle = np.pi / 2
dist_t0, area_t0 = current_state(av_pos)

l1 = 8
l2 = 8
l3 = 8
Beispiel #18
0
class MainTrain(object):
    buffer_size = 100000
    batch_size = 100
    gamma = 0.99
    tau = 0.0001  # Target Network HyperParameters
    LRA = 0.001  # Learning rate for Actor
    LRC = 0.001  # Learning rate for Critic
    explore_iter = 100000.
    episode_count = 20000
    max_steps = 2000
    action_dim = 4  # Discrete behaviour choice (Approach/Observe/Wait/Traverse)
    parameter_acc_dim = 2
    parameter_time_dim = action_dim
    action_size = action_dim + parameter_acc_dim + parameter_time_dim

    def __init__(self):
        self.OU = OU()

        self.total_correct = 0
        self.total_wrong = 0
        self.accuracy_all = []
        self.if_done = False
        self.epsilon = 1
        self.total_reward = None
        self.loss = None

        self.sim_inter = UpdateInter()
        self.state_t = []
        self.state_dim = self.sim_inter.state_dim
        self.action_t = []
        self.action_acc = None
        self.action_time = None
        self.Tau = self.sim_inter.Tau

        self.actor = None
        self.critic = None
        self.buff = None

        self.batch = None
        self.states = None
        self.actions = None
        self.rewards = None
        self.new_states = None
        self.if_dones = None
        self.y_t = None

        # Tensorflow GPU optimization
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        K.set_session(self.sess)

    def load_weights(self):
        print("Now we load the weight")
        try:
            self.actor.model.load_weights("actormodel.h5")
            self.critic.model.load_weights("criticmodel.h5")
            self.actor.target_model.load_weights("actormodel.h5")
            self.critic.target_model.load_weights("criticmodel.h5")
            print("Weight load successfully")
        except:
            print("Cannot find the weight")

    def update_weights(self):
        self.actor.model.save_weights("actormodel.h5", overwrite=True)
        with open("actormodel.json", "w") as outfile:
            json.dump(self.actor.model.to_json(), outfile)
        self.critic.model.save_weights("criticmodel.h5", overwrite=True)
        with open("criticmodel.json", "w") as outfile:
            json.dump(self.critic.model.to_json(), outfile)

    def update_batch(self):
        self.batch = self.buff.getBatch(self.batch_size)
        self.states = np.squeeze(np.asarray([e[0] for e in self.batch]), axis=1)
        self.actions = np.asarray([e[1] for e in self.batch])
        self.rewards = np.asarray([e[2] for e in self.batch])
        self.new_states = np.squeeze(np.asarray([e[3] for e in self.batch]), axis=1)
        self.if_dones = np.asarray([e[4] for e in self.batch])
        self.y_t = np.asarray([e[2] for e in self.batch])
        target_q_values = self.critic.target_model.predict(
            [self.new_states, self.actor.target_model.predict(self.new_states)])
        for k, done in enumerate(self.if_dones):
            self.y_t[k] = self.rewards[k] if done else self.rewards[k] + self.gamma * target_q_values[k]

    def update_loss(self):
        self.loss += self.critic.model.train_on_batch([self.states, self.actions], self.y_t)
        a_for_grad = self.actor.model.predict(self.states)
        grads = self.critic.gradients(self.states, a_for_grad)
        self.actor.train(self.states, grads)
        self.actor.target_train()
        self.critic.target_train()

    def action_noise(self, train_indicator):
        self.epsilon -= 1.0 / self.explore_iter
        noise_t = np.zeros([1, self.action_size])
        action_t_original = self.actor.model.predict(self.state_t)
        print("Action ", action_t_original)
        for i in range(self.action_dim):
            noise_t[0][i] = train_indicator * max(self.epsilon, 0) * \
                            self.OU.function(action_t_original[0][i], 0.00, 0.10, 0.20)
        noise_t[0][4] = train_indicator * max(self.epsilon, 0) * \
                        self.OU.function(action_t_original[0][4], -0.05, self.sim_inter.Max_Acc, 1.00)
        noise_t[0][5] = train_indicator * max(self.epsilon, 0) * \
                        self.OU.function(action_t_original[0][5], 0.05, - self.sim_inter.Max_Acc, 1.00)
        for i in range(self.parameter_time_dim):
            noise_t[0][i + self.action_dim + self.parameter_acc_dim] = \
                train_indicator * max(self.epsilon, 0) * \
                self.OU.function(action_t_original[0][i + self.action_dim + self.parameter_acc_dim], 0.01, 0.50, 0.10)
        action = np.zeros([1, self.action_size])
        for i in range(self.action_size):
            action[0][i] = action_t_original[0][i] + noise_t[0][i]
        return action

    def update_action(self, action, train_indicator, e):
        if action == 0:
            process = 'Approach Process'
            self.action_acc = self.action_t[0][4]
            self.action_time = self.action_t[0][6]
        elif action == 1:
            process = 'Observe Process'
            self.action_time = self.action_t[0][7]
        elif action == 2:
            process = 'Wait Process'
            self.action_time = self.action_t[0][8]
        else:
            process = 'Traverse Process'
            self.action_acc = self.action_t[0][5]
            self.action_time = self.action_t[0][9]
        time_step = int(np.ceil(max(self.action_time / self.Tau, 1.0)))

        collision = False
        if_pass = False
        for ts in range(time_step):
            old_av_y = self.sim_inter.av_y
            old_av_velocity = self.sim_inter.av_velocity
            if action == 1:
                self.action_acc = (self.sim_inter.observe_vel - self.sim_inter.av_velocity) / self.sim_inter.Tau
            elif action == 2:
                self.action_acc = (- self.sim_inter.av_velocity) / self.sim_inter.Tau
            reward, collision = self.sim_inter.reward_function(self.action_acc)
            state_t1 = self.sim_inter.update_vehicle(self.action_acc)

            self.buff.add(self.state_t, self.action_t[0], reward, state_t1, self.if_done)
            self.update_batch()

            if train_indicator:
                self.update_loss()
            self.total_reward += reward
            print(process, " (", self.action_acc, ", ", self.action_time, ") ", "AV = ", old_av_y,
                  "Velocity = ", old_av_velocity, "Episode", e, "Reward", reward, "Loss", self.loss)

            if action == 1 and self.state_t[0][0] <= 0:
                self.state_t = state_t1
                break
            if old_av_y > self.sim_inter.Pass_Point or collision > 0:
                if_pass = old_av_y > self.sim_inter.Pass_Point
                self.if_done = True
                break
            self.state_t = state_t1
        return collision, if_pass

    def launch_train(self, train_indicator=1):  # 1 means Train, 0 means simply Run
        print('Launch Training Process')
        np.random.seed(1337)

        self.state_t = self.sim_inter.get_state()
        self.state_dim = self.sim_inter.state_dim
        self.actor = ActorNetwork(self.sess, self.state_dim, self.action_size, self.batch_size, self.tau, self.LRA)
        self.critic = CriticNetwork(self.sess, self.state_dim, self.action_size, self.batch_size, self.tau, self.LRC)
        self.buff = ReplayBuffer(self.buffer_size)
        self.load_weights()

        for e in range(self.episode_count):
            print("Episode : " + str(e) + " Replay Buffer " + str(self.buff.count()))

            for j in range(self.max_steps):
                self.loss = 0
                self.total_reward = 0
                self.action_t = self.action_noise(train_indicator)
                choose_action = np.argmax(self.action_t[0][0:4])
                collision, if_pass = self.update_action(choose_action, train_indicator, e)

                if self.if_done:
                    self.sim_inter = UpdateInter()
                    self.state_t = self.sim_inter.get_state()
                    self.if_done = False
                    break

            if train_indicator:
                self.update_weights()

            self.total_correct += int(collision <= 0 and if_pass)
            self.total_wrong += int(collision > 0)
            accuracy = 0
            if self.total_correct + self.total_wrong:
                accuracy = float(self.total_correct) / (self.total_correct + self.total_wrong)

            if np.mod(e, 100) == 0:
                self.accuracy_all.append(accuracy)
                self.total_correct = 0
                self.total_wrong = 0

            print("TOTAL REWARD @ " + str(e) + "-th Episode  : Reward " + str(self.total_reward) +
                  " Collision " + str(collision > 0) + " Accuracy " + str(accuracy) +
                  " All Accuracy " + str(self.accuracy_all))
            print("")
        print("Finish.")
Beispiel #19
0
def playGame(train_indicator=1,
             safety_constrain_flag=False):  #1 means Train, 0 means simply Run
    #initialization = 0
    episode_trained = 0
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.9999
    TAU = 0.001  #Target Network HyperParameters
    LRA = 0.0001  #Learning rate for Actor
    LRC = 0.001  #Learning rate for Critic

    action_dim = 2  #Two high-level actions, mapped to low-level controls by Get_actions
    state_dim = 29 + 36  #of sensors input

    np.random.seed(1337)

    vision = False

    EXPLORE = 100000.
    episode_count = 1000
    max_steps = 300
    reward = 0
    done = False
    step = 0
    epsilon = 1.0
    indicator = 0

    plt.ion()

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  #Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    #Now load the weight
    print("Now we load the weight")

    try:
        actor.model.load_weights("actormodel_following.h5")
        critic.model.load_weights("criticmodel_following.h5")
        actor.target_model.load_weights("actormodel_following.h5")
        critic.target_model.load_weights("criticmodel_following.h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    print("TORCS Experiment Start.")
    cumreward_list = []
    average_step_reward_list = []
    damage_rate_list = []
    epsilon_list = []
    results_list = []
    trackPos_list = []
    speed_list = []
    epreward_list = []
    damage_time = []

    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))
        print("Epsilon is: ", epsilon)
        if np.mod(i, 3) == 0:
            ob = env.reset(
                relaunch=True
            )  #relaunch TORCS every 3 episodes because of the memory leak error
        else:
            ob = env.reset()

        s_t = np.hstack(
            (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ,
             ob.wheelSpinVel / 100.0, ob.rpm, ob.opponents))
        epsilon = epsilon * 0.998
        total_reward = 0.
        damage_steps = 0
        for j in range(max_steps):
            loss = 0
            damage = 0
            #epsilon -= 1 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])
            a_t_original = actor.target_model.predict(
                s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0.1) * OU.function2(
                a_t_original[0][0], 0.5, 0.90, 0.2)
            #noise_t[0][1] = train_indicator * max(epsilon, 0.0) * OU.function(a_t_original[0][1],  1.0 , 1.00, 0.10)
            noise_t[0][1] = train_indicator * max(epsilon, 0.1) * OU.function1(
                a_t_original[0][1], 0.9, 1.0, 0.60)

            #The following code does the stochastic brake
            #if random.random() <= 0.1:
            #    print("********Now we apply the brake***********")
            #    noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2],  0.2 , 1.00, 0.10)
            '''
            if np.random.randn() < max(epsilon,0.05):
                a_t[0][0] = np.random.randn()*2-1
            else:
                a_t[0][0] = a_t_original[0][0]
            '''

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]

            a_t_primitive = Get_actions(a_t[0][0],
                                        a_t[0][1],
                                        ob,
                                        safety_constrain=safety_constrain_flag)

            ob, r_t, done, info = env.step(a_t_primitive)

            if r_t == -5.0 or r_t == -1.0:
                damage_steps += 1
                damage = 1

            trackPos_list.append(ob.trackPos)
            speed_list.append(ob.speedX)
            epreward_list.append(r_t)
            damage_time.append(damage)

            s_t1 = np.hstack(
                (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                 ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm, ob.opponents))

            buff.add(s_t, a_t[0], r_t, s_t1, done)  #Add replay buffer

            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train_indicator):
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t,
                  "Loss", loss)

            step += 1
            if done:
                break

        damage_rate = float(damage_steps) / j * 100

        if np.mod(i, 3) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("actormodel_following.h5",
                                         overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("criticmodel_following.h5",
                                          overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)
        if train_indicator:
            # Save the results
            cumreward_list.append(total_reward)
            average_step_reward_list.append(total_reward / j)
            damage_rate_list.append(damage_rate)
            epsilon_list.append(epsilon)
            sio.savemat(
                'results_overtaking.mat', {
                    'total_reward': cumreward_list,
                    'average_reward': average_step_reward_list,
                    'epsilon': epsilon_list,
                    'damage': damage_rate_list
                })
        else:
            sio.savemat(
                'info.mat', {
                    'ep_reward': epreward_list,
                    'trackPos': trackPos_list,
                    'speed': speed_list,
                    'damage_rate': damage_rate,
                    'damage_time': damage_time
                })
            print('damage rate is:', damage_rate)

        plt.figure(1)
        plt.hold(True)
        plt.subplot(511)
        plt.plot(i, total_reward, 'ro')
        plt.xlabel("Episodie")
        plt.ylabel("Episodic total reward")
        plt.subplot(512)
        plt.plot(i, total_reward / j, 'bo')
        plt.xlabel("Episodie")
        plt.ylabel("Expected reward each step")
        plt.subplot(513)
        plt.plot(i, damage_rate, 'go')
        plt.xlabel("Episodie")
        plt.ylabel("Damage rate per episode [%]")
        plt.subplot(514)
        plt.plot(i, max(epsilon, 0.1), 'yo')
        plt.xlabel("Episodie")
        plt.ylabel("epsilon")
        plt.subplot(515)
        plt.plot(i, loss / j, 'yo')
        plt.xlabel("Episodie")
        plt.ylabel("Average loss")
        plt.draw()
        plt.show()
        plt.pause(0.001)

        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    plt.savefig('test.png')
    print("Finish.")
Beispiel #20
0
def playGame(train=0):  #1 means Train, 0 means simply Run
    load_from = "."

    save_to = os.path.join("data", "saved")
    save_thresh = 100000  # Save weights when an episode's total reward exceeds this threshold

    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001  #Target Network HyperParameters
    LRA = 0.0001  #Learning rate for Actor
    LRC = 0.001  #Learning rate for Critic

    action_dim = 3  #Steering/Acceleration/Brake
    state_dim = 29  #of sensors input

    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 100000

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    keras.backend.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    ou = OU().function  #Ornstein-Uhlenbeck Process
    buff = ReplayBuffer(BUFFER_SIZE)

    env = TorcsEnv(vision=False, throttle=True, gear_change=False)

    def state(ob):
        return np.hstack(
            (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ,
             ob.wheelSpinVel / 100.0, ob.rpm))

    def load_weights(dir):
        print("Loading weights from ", dir)
        try:
            actor.model.load_weights(os.path.join(dir, "actormodel.h5"))
            critic.model.load_weights(os.path.join(dir, "criticmodel.h5"))
            actor.target_model.load_weights(os.path.join(dir, "actormodel.h5"))
            critic.target_model.load_weights(
                os.path.join(dir, "criticmodel.h5"))
            print("Weight load successfully")
        except:
            print("Cannot find the weight")

    def save_weights(dir):
        if not os.path.exists(dir):
            os.makedirs(dir)

        print("Saving weights in ", dir)
        actor.model.save_weights(os.path.join(dir, "actormodel.h5"),
                                 overwrite=True)
        critic.model.save_weights(os.path.join(dir, "criticmodel.h5"),
                                  overwrite=True)

        with open(os.path.join(dir, "actormodel.json"), "w") as outfile:
            json.dump(actor.model.to_json(), outfile)

        with open(os.path.join(dir, "criticmodel.json"), "w") as outfile:
            json.dump(critic.model.to_json(), outfile)

    load_weights(load_from)
    # Generate a Torcs environment

    print("TORCS Experiment Start.")
    np.random.seed(1337)

    done = False
    step = 0
    epsilon = 1

    for episode in range(episode_count):

        print("Episode : " + str(episode) + " Replay Buffer " +
              str(buff.count()))

        ob = env.reset()
        s_t = state(ob)

        total_reward = 0.

        progress = tqdm.trange(max_steps, disable=not train)
        for _ in progress:
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train * max(epsilon, 0) * ou(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train * max(epsilon, 0) * ou(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train * max(epsilon, 0) * ou(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            #The following code does the stochastic brake
            #if random.random() <= 0.1:
            #    print("********Now we apply the brake***********")
            #    noise_t[0][2] = train * max(epsilon, 0) * OU.function(a_t_original[0][2],  0.2 , 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])
            s_t1 = state(ob)

            buff.add(s_t, a_t[0], r_t, s_t1, done)  #Add replay buffer

            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])

            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train):
                loss += critic.model.train_on_batch([states, actions], y_t)

                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)

                actor.update_target()
                critic.update_target()

            total_reward += r_t
            s_t = s_t1

            progress.set_description("Episode %4i, TR %6.0f, loss %7.0f" %
                                     (episode, total_reward, loss))
            #print("Episode", i, "Step", step, "Action", [ "%.3f" % x for x in a_t[0]], "Reward", r_t, "Loss", loss)

            step += 1
            if done:
                break

        #print("Episode %i, TOTAL REWARD %.0f" % (episode, total_reward))

        if train and total_reward > save_thresh:
            save_weights(save_to + str(episode))
            save_thresh = min(1000000, 2 * save_thresh)

    env.end()  # This is for shutting down TORCS
    print("Finish.")
Beispiel #21
0
class DDPG:
    """docstring for DDPG"""
    def __init__(self, env_name, state_dim, action_dim):
        self.name = 'DDPG'  # name for uploading results
        self.env_name = env_name
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = state_dim
        self.action_dim = action_dim

        # Ensure action bound is symmetric
        self.time_step = 0
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.OU = OU()

        # loading networks
        self.saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state(save_location)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print("Successfully loaded:", checkpoint.model_checkpoint_path)
        else:
            print("Could not find old network weights")

    def train(self):
        #print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.getBatch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def saveNetwork(self):
        self.saver.save(self.sess,
                        save_location + self.env_name + 'network' + '-ddpg',
                        global_step=self.time_step)

    def action(self, state):
        action = self.actor_network.action(state)
        action[0] = np.clip(action[0], -1, 1)
        action[1] = np.clip(action[1], 0, 1)
        action[2] = np.clip(action[2], 0, 1)
        #print "Action:", action
        return action

    def noise_action(self, state, epsilon):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        #print action.shape
        #print "Action_No_Noise:", action
        noise_t = np.zeros(self.action_dim)
        noise_t[0] = epsilon * self.OU.function(action[0], 0.0, 0.60, 0.80)
        noise_t[1] = epsilon * self.OU.function(action[1], 0.5, 1.00, 0.10)
        noise_t[2] = epsilon * self.OU.function(action[2], -0.1, 1.00, 0.05)

        if random.random() <= 0.01:  # 0.1
            print("********Stochastic brake***********")
            noise_t[2] = epsilon * self.OU.function(action[2], 0.2, 1.00, 0.10)

        action = action + noise_t
        action[0] = np.clip(action[0], -1, 1)
        action[1] = np.clip(action[1], 0, 1)
        action[2] = np.clip(action[2], 0, 1)

        #print "Action_Noise:", action
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer

        if (not (math.isnan(reward))):
            self.replay_buffer.add(state, action, reward, next_state, done)

        self.time_step = self.time_step + 1
        # Store transitions to replay start size then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()
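A hedged sketch of how the DDPG class above can be driven from an environment loop; the environment interface, loop bounds, and epsilon schedule are illustrative assumptions, and REPLAY_START_SIZE / save_location are module constants not shown here.

# Hedged usage sketch for the DDPG agent above (env and loop bounds are illustrative).
def run_agent(env, env_name, state_dim, action_dim, episodes=1000, max_steps=1000):
    agent = DDPG(env_name, state_dim, action_dim)
    epsilon = 1.0
    for episode in range(episodes):
        state = env.reset()
        for _ in range(max_steps):
            action = agent.noise_action(state, max(epsilon, 0.0))
            next_state, reward, done, _ = env.step(action)
            agent.perceive(state, action, reward, next_state, done)  # trains once the buffer is warm
            state = next_state
            epsilon -= 1.0 / 100000.0
            if done:
                break
        agent.saveNetwork()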
def playGame(checkpoints=None,
             train_indicator=1,
             eps=1.0):  #1 means Train, 0 means simply Run
    BUFFER_SIZE = 40000
    BATCH_SIZE = 16
    GAMMA = 0.99
    TAU = 0.001  #Target Network HyperParameters
    LRA = 0.01  #Learning rate for Actor
    LRC = 0.05  #Learning rate for Critic

    vision = True
    action_dim = 3  #Steering/Acceleration/Brake

    if vision:
        state_dim = (64, 64, 3)  #of sensors input
    else:
        state_dim = 29
    np.random.seed(1337)

    EXPLORE = 1000000.
    episode_count = 2000
    max_steps = 8000000
    reward = 0
    done = False
    step = 0
    epsilon = eps
    indicator = 0

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)
    summary_writer = tf.train.SummaryWriter('logs', graph_def=sess.graph_def)
    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA,
                         vision, summary_writer)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC,
                           vision)
    buff = ReplayBuffer(BUFFER_SIZE)  #Create replay buffer
    history = History()

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)
    log_file = open('train_log.log', 'w')
    #Now load the weight
    print("Now we load the weight")
    try:
        actor.model.load_weights("actormodel_{}.h5".format(checkpoints))
        critic.model.load_weights("criticmodel_{}.h5".foramt(checkpoints))
        actor.target_model.load_weights("actormodel_{}.h5".format(checkpoints))
        critic.target_model.load_weights(
            "criticmodel_{}.h5".format(checkpoints))
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    print("TORCS Experiment Start.")
    max_reward = 0
    min_reward = 0

    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            ob = env.reset(
                relaunch=True
            )  #relaunch TORCS every 3 episodes because of the memory leak error
        else:
            ob = env.reset()

        if vision:
            history.fill((ob.img))
            s_t = history.get()
        else:
            s_t = np.hstack(
                (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                 ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))

        total_reward = 0.
        total_damage = 0.
        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            if vision:
                a_t_original = actor.model.predict(
                    s_t.reshape((-1, ) + state_dim))
            else:
                a_t_original = actor.model.predict(s_t.reshape(
                    1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.30, 0.30)
            noise_t[0][1] = 0.1 + train_indicator * max(
                epsilon, 0) * OU.function(a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])
            damage = ob.damage

            if vision:
                last_s_t = history.get().copy()
                history.add((ob.img))
                next_s_t = history.get().copy()
                if np.mod(step, 4) == 0:
                    buff.add(last_s_t, a_t[0], r_t, next_s_t,
                             done)  #Add replay buffer
                s_t1 = history.get()
            else:
                s_t1 = np.hstack(
                    (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                     ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
                buff.add(s_t, a_t[0], r_t, s_t1, done)

            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            if vision:
                target_q_values = critic.target_model.predict([
                    new_states.reshape((-1, ) + state_dim),
                    actor.target_model.predict(new_states).reshape(
                        (-1, ) + (action_dim, ))
                ])
            else:
                target_q_values = critic.target_model.predict(
                    [new_states,
                     actor.target_model.predict(new_states)])
            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if train_indicator and buff.count() >= 1000:
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)

                actor.target_train()
                critic.target_train()

            total_reward += r_t
            total_damage += damage
            s_t = s_t1

            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t,
                  "Loss", loss)

            step += 1
            if done:
                break

        if np.mod(i, 3) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("actormodel_{}.h5".format(i),
                                         overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("criticmodel_{}.h5".format(i),
                                          overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)
        max_reward = max(max_reward, total_reward)
        min_reward = min(min_reward, total_reward)
        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
              str(total_reward) + "  EPS " + str(epsilon))
        print("Total Step: " + str(step) + ' Max: ' + str(max_reward) +
              ' Min: ' + str(min_reward))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
Beispiel #23
0
import argparse

from keras.models import Model
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.optimizers import Adam
import tensorflow as tf
#from keras.engine.training import collect_trainable_weights
from keras import backend as K
from mmstore import MMStore
from mujoco_actor_nn import ActorNetwork
from mujoco_critic_nn import CriticNetwork
from OU import OU
import time

noise_func = OU()
MAX_INTERACTION = 10000000
MAX_EPI_INT = 100
MEM_SIZE = 500000
BATCH_SIZE = 64
REPETITION_NUM = 3
gamma = 0.99
SOFT_UPDATE = 1e-3
ALR = 1e-4
CLR = 1e-3

train_flag = False
train_int_cnt = 0
epi_flag = False
epi_int_cnt = 0
epi_cnt = 0
Beispiel #24
0
def playGame(train_indicator=0):  #1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001  #Target Network HyperParameters
    LRA = 0.0001  #Learning rate for Actor
    LRC = 0.001  #Learning rate for Critic

    action_dim = 3  #Steering/Acceleration/Brake
    state_dim = 24  #of sensors input

    np.random.seed(1337)

    vision = False

    EXPLORE = 300000.
    episode_count = 20000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1.0
    # epsilon = 1
    indicator = 0

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  #Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    #Now load the weight
    load_name = "sample_v0_40"
    print("Now we load the weight")
    try:
        actor.model.load_weights("saved/actormodel_{}.h5".format(load_name))
        critic.model.load_weights("saved/criticmodel_{}.h5".format(load_name))
        actor.target_model.load_weights(
            "saved/actormodel_{}.h5".format(load_name))
        critic.target_model.load_weights(
            "saved/criticmodel_{}.h5".format(load_name))
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    plt.figure()
    overall_scores = []
    model_name = "sample_v0"

    print("TORCS Experiment Start.")

    attacks = []
    for i in range(-10, 0):
        val = i / 10.0
        attacks.append([77, val])
    # for i in range(45, 55):
    #     attacks.append([i, -1.5])
    #     attacks.append([i, 1.5])
    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        # if np.mod(i, 3) == 0:
        #     ob = env.reset(relaunch=True)   #relaunch TORCS every 3 episode because of the memory leak error
        # else:
        #     ob = env.reset()
        ob = env.reset()

        s_t = np.hstack(
            (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ))

        total_reward = 0.
        cur_sample = []
        for j in range(max_steps):
            # if j == 50:
            # time.sleep(0.099)
            # continue
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            # if j > 120:
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            #The following code does the stochastic brake
            #if random.random() <= 0.1:
            #    print("********Now we apply the brake***********")
            #    noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2],  0.2 , 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]
            if j < 20 and train_indicator:
                a_t[0][1] += 0.5
            # if j == 71:
            #     print("cp attack!")
            #     if a_t[0][0] > 0:
            #         a_t[0][0] = -0.3
            #     else:
            #         a_t[0][0] = 0.3
            # print("%.2f"%a_t[0][0])
            # a_t[0][2] += 0.7
            # if ob.speedX > 0.6:
            # a_t[0][1] = 0
            if (j == attacks[i][0]):
                print('cp attack on {} with {}'.format(attacks[i][0],
                                                       attacks[i][1]))
                a_t[0][0] = attacks[i][1]
            ob, r_t, done, info = env.step(a_t[0])
            print "step: {} reward: {:.5f} action: {:.5f} {:.5f} {:.5f} ".format(
                j, r_t, a_t[0][0], a_t[0][1], a_t[0][2])

            # print "{:.5f} {:.5f} {:.5f} {:.5f} {:.5f}".format(r_t, ob.speedX, ob.speedY, ob.speedZ, ob.rpm)
            # if(r_t < -50):
            #     r_t -= 10000
            #     done = True
            if j > 20 and ob.rpm <= 0.09426:
                r_t -= 1000
                done = True

            theta = 0.1
            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                              ob.speedY, ob.speedZ))
            # s_t1_new = np.array([val + np.abs(val)*random.uniform(-1,1)*theta for val in s_t1])
            # print(np.linalg.norm(s_t1_new - s_t1))
            # s_t1 = s_t1_new

            buff.add(s_t, a_t[0], r_t, s_t1, done)  #Add replay buffer
            cur_step_sample = [
                s_t.tolist(), a_t[0].tolist(), r_t,
                s_t1.tolist(), done
            ]
            cur_sample.append(cur_step_sample)

            # #Do the batch update
            # batch = buff.getBatch(BATCH_SIZE)
            # states = np.asarray([e[0] for e in batch])
            # actions = np.asarray([e[1] for e in batch])
            # rewards = np.asarray([e[2] for e in batch])
            # new_states = np.asarray([e[3] for e in batch])
            # dones = np.asarray([e[4] for e in batch])
            # y_t = np.asarray([e[1] for e in batch])

            # target_q_values = critic.target_model.predict([new_states, actor.target_model.predict(new_states)])

            # for k in range(len(batch)):
            #     if dones[k]:
            #         y_t[k] = rewards[k]
            #     else:
            #         y_t[k] = rewards[k] + GAMMA*target_q_values[k]

            # if (train_indicator):
            #     loss += critic.model.train_on_batch([states,actions], y_t)
            #     a_for_grad = actor.model.predict(states)
            #     grads = critic.gradients(states, a_for_grad)
            #     actor.train(states, grads)
            #     actor.target_train()
            #     critic.target_train()

            total_reward += r_t
            s_t = s_t1

            # print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)

            step += 1
            if done:
                break

            if j > 200:
                break

        if np.mod(i, 3) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("saved/actormodel_{}_{}.h5".format(
                    model_name, int(step / 10000)),
                                         overwrite=True)
                # with open("actormodel.json", "w") as outfile:
                #     json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("saved/criticmodel_{}_{}.h5".format(
                    model_name, int(step / 10000)),
                                          overwrite=True)
                # with open("criticmodel.json", "w") as outfile:
                #     json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")
        s = "{},{},{:.3f},{},{}\n".format(i, j, total_reward, attacks[i][0],
                                          attacks[i][1])
        with open('logs/attack_{}.csv'.format(model_name), 'a') as the_file:
            the_file.write(s)
        # overall_scores.append(total_reward)
        # plt.clf()
        # plt.plot(overall_scores)
        # plt.savefig("train_plots/{}_{}.jpg".format(model_name, int(step/10000)))
        # with open('samples/{}_{:05d}.pk'.format(model_name, i), 'w') as outfile:
        #     pickle.dump(cur_sample, outfile)

    env.end()  # This is for shutting down TORCS
    print("Finish.")
Beispiel #25
0
def playGame(train_indicator=1):    # 1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000  # Replay buffer capacity
    BATCH_SIZE = 32  # Batch size: number of samples per update
    GAMMA = 0.99  # Discount factor
    TAU = 0.001     # Target Network HyperParameters (soft-update rate)
    LRA = 0.0001    # Learning rate for Actor
    LRC = 0.001     # Learning rate for Critic

    action_dim = 3  # Steering/Acceleration/Brake
    state_dim = 29  # of sensors input (29 sensor readings)

    np.random.seed(1337)  # Fixed random seed: the same number reproduces the same random sequence on every run

    vision = False

    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    # Tensorflow GPU memory policy: grow the allocation on demand
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # Alternatively, hard-cap GPU memory usage at 40%:
    # config.gpu_options.per_process_gpu_memory_fraction = 0.4
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)    # Create replay buffer

    #  Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True,gear_change=False)

    # Now load the weight
    print("Now we load the weight")
    try:
        actor.model.load_weights("actormodel.h5")
        critic.model.load_weights("criticmodel.h5")
        actor.target_model.load_weights("actormodel.h5")
        critic.target_model.load_weights("criticmodel.h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    theTime = datetime.datetime.now()  # Current system time
    theTime = theTime.strftime('%y-%m-%d_%H:%M:%S')  # Formatted as a string for the CSV file names
    folder_path = "practise_progress/" + theTime + "/"  # Linux-style path; only valid on Linux
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
        print("folder created")
    else:
        print("folder existed")

    print("TORCS Experiment Start.")
    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            ob = env.reset(relaunch=True)   # relaunch TORCS every 3 episodes because of the memory leak error
        else:
            ob = env.reset()

        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,  ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
     
        total_reward = 0.

        csvfileHeader = "practise_progress/" + theTime + "/" + " Episode " + str(i) + ".csv"
        fileHeader = ["Step", "TrackPos", "SpeedX", "SpeedY", "SpeedZ",
                      "Action_Steering", "Action_Acceleration", "Action_Brake", "Reward", "Loss"]
        csvFile = open(csvfileHeader, "w")
        writer = csv.writer(csvFile)
        writer.writerow(fileHeader)

        for j in range(max_steps):
            loss = 0 
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])
            
            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][0],  0.0 , 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][1],  0.5 , 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], -0.1 , 1.00, 0.05)

            # The following code does the stochastic brake
            # if random.random() <= 0.1:
            #     print("********Now we apply the brake***********")
            #     noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2],  0.2 , 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
        
            buff.add(s_t, a_t[0], r_t, s_t1, done)      # Add replay buffer
            
            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict([new_states, actor.target_model.predict(new_states)])  
           
            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA*target_q_values[k]
       
            if (train_indicator):
                loss += critic.model.train_on_batch([states,actions], y_t) 
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            csvData = [step, ob.trackPos, ob.speedX * 300, ob.speedY * 300, ob.speedZ * 300,
                       a_t[0, 0], a_t[0, 1], a_t[0, 2], r_t, loss]
            """        参数记录
                       轮次  步骤计数  车辆位置  X轴速度  Y轴速度  Z轴速度
                       加速输出  转向输出  刹车输出  回报  损失函"""
            writer.writerow(csvData)
            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)
            step += 1
            if done:
                csvFile.close()
                break

        if np.mod(i, 3) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("actormodel.h5", overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("criticmodel.h5", overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)



        print("TOTAL REWARD @ " + str(i) +"-th Episode  : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  #  This is for shutting down TORCS
    print("Finish.")
Beispiel #26
0
def playGame(DDPG_config,
             train_indicator=1):  #1 means Train, 0 means simply Run
    # SETUP STARTS HERE
    if train_indicator > 0:
        folder = setup_run(DDPG_config)
    elif train_indicator == 0:
        folder = DDPG_config['EXPERIMENT']

    if DDPG_config['RSEED'] == 0:
        DDPG_config['RSEED'] = None
    np.random.seed(DDPG_config['RSEED'])

    ACTIVE_NODES = DDPG_config['ACTIVE_NODES']

    # Generate an environment
    if DDPG_config['ENV'] == 'balancing':
        env = OmnetBalancerEnv(DDPG_config, folder)
    elif DDPG_config['ENV'] == 'label':
        env = OmnetLinkweightEnv(DDPG_config, folder)

    action_dim, state_dim = env.a_dim, env.s_dim

    MU = DDPG_config['MU']
    THETA = DDPG_config['THETA']
    SIGMA = DDPG_config['SIGMA']

    ou = OU(action_dim, MU, THETA, SIGMA)  #Ornstein-Uhlenbeck Process

    BUFFER_SIZE = DDPG_config['BUFFER_SIZE']
    BATCH_SIZE = DDPG_config['BATCH_SIZE']
    GAMMA = DDPG_config['GAMMA']
    EXPLORE = DDPG_config['EXPLORE']
    EPISODE_COUNT = DDPG_config['EPISODE_COUNT']
    MAX_STEPS = DDPG_config['MAX_STEPS']
    if EXPLORE <= 1:
        EXPLORE = EPISODE_COUNT * MAX_STEPS * EXPLORE
    # SETUP ENDS HERE

    reward = 0
    done = False
    wise = False
    step = 0
    epsilon = 1
    indicator = 0

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, DDPG_config)
    critic = CriticNetwork(sess, state_dim, action_dim, DDPG_config)
    buff = ReplayBuffer(BUFFER_SIZE)  #Create replay buffer

    ltm = ['a_h0', 'a_h1', 'a_V', 'c_w1', 'c_a1', 'c_h1', 'c_h3', 'c_V']
    layers_to_mind = {}
    L2 = {}

    for k in ltm:
        layers_to_mind[k] = 0
        L2[k] = 0

    vector_to_file(ltm, folder + 'weightsL2' + 'Log.csv', 'w')

    #Now load the weight
    try:
        actor.model.load_weights(folder + "actormodel.h5")
        critic.model.load_weights(folder + "criticmodel.h5")
        actor.target_model.load_weights(folder + "actormodel.h5")
        critic.target_model.load_weights(folder + "criticmodel.h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    print("OMNeT++ Experiment Start.")
    # initial state of simulator
    s_t = env.reset()
    loss = 0
    for i in range(EPISODE_COUNT):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        total_reward = 0
        for j in range(MAX_STEPS):
            print('step ', j)
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))

            if train_indicator and epsilon > 0 and (step % 1000) // 100 != 9:
                noise_t[0] = epsilon * ou.evolve()

            a = a_t_original[0]
            n = noise_t[0]
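            # keep the noisy action in [0, 1]: add the noise when the result
            # stays in range, otherwise reflect it (a - n), then clip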
            a_t[0] = np.where((a + n > 0) & (a + n < 1), a + n,
                              a - n).clip(min=0, max=1)

            # execute action
            s_t1, r_t, done = env.step(a_t[0], j)
            # print(s_t1)
            print('reward ', r_t)

            buff.add(s_t, a_t[0], r_t, s_t1, done)  #Add replay buffer

            scale = lambda x: x
            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = scale(np.asarray([e[0] for e in batch]))
            actions = scale(np.asarray([e[1] for e in batch]))
            rewards = scale(np.asarray([e[2] for e in batch]))
            new_states = scale(np.asarray([e[3] for e in batch]))
            dones = np.asarray([e[4] for e in batch])

            y_t = np.zeros([len(batch), action_dim])
            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if train_indicator and len(batch) >= BATCH_SIZE:
                loss = critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                # does this give an output like train_on_batch above? NO
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()
                with open(folder + 'lossLog.csv', 'a') as file:
                    file.write(pretty(loss) + '\n')

            total_reward += r_t
            s_t = s_t1

            for layer in actor.model.layers + critic.model.layers:
                if layer.name in layers_to_mind.keys():
                    L2[layer.name] = np.linalg.norm(
                        np.ravel(layer.get_weights()[0]) -
                        layers_to_mind[layer.name])
                    #                     vector_to_file(np.ravel(layer.get_weights()[0]), folder + 'weights_' + layer.name + 'Log.csv', 'a')
                    layers_to_mind[layer.name] = np.ravel(
                        layer.get_weights()[0])


            # if max(L2.values()) <= 0.02:
            #     wise = True

            if train_indicator and len(batch) >= BATCH_SIZE:
                vector_to_file([L2[x] for x in ltm],
                               folder + 'weightsL2' + 'Log.csv', 'a')

            vector_to_file(a_t_original[0], folder + 'actionLog.csv', 'a')
            vector_to_file(noise_t[0], folder + 'noiseLog.csv', 'a')

            if 'PRINT' in DDPG_config.keys() and DDPG_config['PRINT']:
                print("Episode", "%5d" % i, "Step", "%5d" % step, "Reward",
                      "%.6f" % r_t)
                print("Epsilon", "%.6f" % max(epsilon, 0))

                att_ = np.split(a_t[0], ACTIVE_NODES)
                for _ in range(ACTIVE_NODES):
                    att_[_] = np.insert(att_[_], _, -1)
                att_ = np.concatenate(att_)
                print("Action\n", att_.reshape(ACTIVE_NODES, ACTIVE_NODES))
                print(max(L2, key=L2.get), pretty(max(L2.values())))

            step += 1
            if done or wise:
                break

        if step % 1000 == 0:  # save when the global step count hits a multiple of 1000 (checked once per episode)
            if (train_indicator):
                actor.model.save_weights(folder + "actormodel.h5",
                                         overwrite=True)
                actor.model.save_weights(folder + "actormodel" + str(step) +
                                         ".h5")
                with open(folder + "actormodel.json", "w") as outfile:
                    outfile.write(actor.model.to_json(indent=4) + '\n')

                critic.model.save_weights(folder + "criticmodel.h5",
                                          overwrite=True)
                critic.model.save_weights(folder + "criticmodel" + str(step) +
                                          ".h5")
                with open(folder + "criticmodel.json", "w") as outfile:
                    outfile.write(critic.model.to_json(indent=4) + '\n')

        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down
    print("Finish.")
Beispiel #27
0
def playGame(train_indicator=0):  #1 means Train, 0 means simply Run
    time.sleep(1)
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001  #Target Network HyperParameters
    LRA = 0.0001  #Learning rate for Actor
    LRC = 0.001  #Learning rate for Critic

    action_dim = 3  #Steering/Acceleration/Brake
    state_dim = 24  #of sensors input

    np.random.seed(1337)

    vision = False

    EXPLORE = 300000.
    episode_count = 20000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1.0
    # epsilon = 1
    indicator = 0

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  #Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)
    pre_model = load_model("weights_rescale_all-0000.hdf5")
    # x = np.array([ 4.82767379e-01,  5.92105016e-02,  3.61700505e-01,  2.74807483e-01,
    #     2.31401995e-01,  2.07236990e-01,  1.95800006e-01,  1.89892501e-01,
    #     1.84837490e-01,  1.81293502e-01,  1.77807003e-01,  1.74377009e-01,
    #     1.71005994e-01,  1.66384503e-01,  1.61247000e-01,  1.52030498e-01,
    #     1.35238498e-01,  1.11962005e-01,  8.79574940e-02,  4.76383008e-02,
    #     4.78339800e-01,  6.97819047e-01,  4.60800716e-01,  5.00754069e-01,
    #     -1.00000000e+00,  9.99979496e-01,  8.71338917e-13])
    # x_s = np.array([x, x])
    # pre_y = pre_model.predict(x_s)
    # print(x_s[0])
    # print(pre_y[0])

    #Now load the weight
    load_name = "sample_v0_40"
    print("Now we load the weight")
    try:
        actor.model.load_weights("saved/actormodel_{}.h5".format(load_name))
        critic.model.load_weights("saved/criticmodel_{}.h5".format(load_name))
        actor.target_model.load_weights(
            "saved/actormodel_{}.h5".format(load_name))
        critic.target_model.load_weights(
            "saved/criticmodel_{}.h5".format(load_name))
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    plt.figure()
    overall_scores = []
    model_name = "sample_v0"

    print("TORCS Experiment Start.")
    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            ob = env.reset(
                relaunch=True
            )  #relaunch TORCS every 3 episode because of the memory leak error
        else:
            ob = env.reset()

        s_t = np.hstack(
            (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ))

        total_reward = 0.
        cur_sample = []
        attack_valid = 1
        gap = (i / 10) / 100.0
        attack_step = -1
        attack_target = 0
        for j in range(max_steps):
            # if j == 50:
            # time.sleep(0.099)
            # continue
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            # if j > 120:
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            #The following code does the stochastic brake
            #if random.random() <= 0.1:
            #    print("********Now we apply the brake***********")
            #    noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2],  0.2 , 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]
            if j < 20 and train_indicator:
                a_t[0][1] += 0.5
            # os.system("scrot saved_pic/{}.png".format(j))
            if j == 80:
                print("cp attack!")
                a_t[0][0] = -1.0
            if j == 83:
                os.system("scrot saved_pic/{}.png".format(j))
            #    if a_t[0][0] > 0:
            #         a_t[0][0] = -0.3
            #     else:
            #         a_t[0][0] = 0.3
            # print("%.2f"%a_t[0][0])
            # a_t[0][2] += 0.7
            # if ob.speedX > 0.6:
            # a_t[0][1] = 0
            # if(step == 60):
            # a_t[0][0] = 1.0
            # s_t_scaled = rescale_state(s_t)
            # # print(s_t[0])
            # s_t_0 = restore_state(s_t_scaled)
            # # print(s_t_0[0])
            # new_a_t = actor.model.predict(s_t_0.reshape(1, s_t_0.shape[0]))
            # s_t_scaled_list = np.array([np.copy(s_t_scaled) for val in range(21)])
            # actions = np.array([np.copy(a_t[0]) for val in range(21)])
            # for val in range(21):
            #     actions[val][0] = -1.0 + val/10.0
            # # print(actions)
            # x_0 = np.hstack((s_t_scaled_list, actions))
            # # print(x_0.shape, s_t_scaled_list.shape, actions.shape)
            # pre_y = pre_model.predict(x_0)
            # # print(x_0[0])
            # # print(pre_y[0])

            # steer_index = int(a_t[0][0]*10.0 + 10.0)
            # for pre_step in range(2):
            #     restore_new_Y = restore_states(pre_y)
            #     actions = actor.model.predict(restore_new_Y)
            #     x_step1 = np.hstack((pre_y, actions))
            #     pre_y = pre_model.predict(x_step1)

            # for index in range(21):
            #     diff = calsulate_d(pre_y[index]) - calsulate_d(pre_y[steer_index])
            #     pro = np.random.random()
            #     if diff > gap and attack_valid == 1 and pro > 0.8 and j > 50:
            #         a_t[0][0] = -1.0 + index/10.0
            #         print("adv!", diff, "pro:", pro)
            #         attack_step = j
            #         attack_target = a_t[0][0]
            #         attack_valid -= 1

            # dis_list = np.array([(calsulate_d(st) - calsulate_d(pre_y[steer_index])) for st in pre_y])
            # print("{:.2f}".format(max(dis_list)*100000))
            # print("{}".format(max(dis_list)*100000))

            # s_t_scaled = np.copy(s_t1)
            # s_t_scaled[0] = rescale_data(s_t_scaled[0], 0.5)
            # s_t_scaled[20] = rescale_data(s_t_scaled[20], 2.5)
            # s_t_scaled[21] = rescale_data(s_t_scaled[21], 0.7)
            # s_t_scaled[22] = rescale_data(s_t_scaled[22], 0.7)
            # s_t_scaled[23] = rescale_data(s_t_scaled[23], 0.7)
            # actions = actor.model.predict(s_t_scaled.reshape(1, s_t_scaled.shape[0]))
            # print(actions[0][0])

            # ob, r_t, done, info = env.step(new_a_t[0])
            ob, r_t, done, info = env.step(a_t[0])
            print "step: {} reward: {:.5f} action: {:.5f} {:.5f} {:.5f} ".format(
                j, r_t, a_t[0][0], a_t[0][1], a_t[0][2])
            # print(a_t[0][0])

            # print "{:.5f} {:.5f} {:.5f} {:.5f} {:.5f}".format(r_t, ob.speedX, ob.speedY, ob.speedZ, ob.rpm)
            # if(r_t < -50):
            #     r_t -= 10000
            #     done = True
            if j > 20 and ob.rpm <= 0.09426:
                r_t -= 1000
                done = True

            theta = 0.1
            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                              ob.speedY, ob.speedZ))

            # action_states = []
            # for i in range(-5, 6):

            # s_t1_new = np.array([val + np.abs(val)*random.uniform(-1,1)*theta for val in s_t1])
            # print(np.linalg.norm(s_t1_new - s_t1))
            # s_t1 = s_t1_new

            buff.add(s_t, a_t[0], r_t, s_t1, done)  #Add replay buffer
            # cur_step_sample = [s_t.tolist(), a_t[0].tolist(), r_t, s_t1.tolist(), done]
            # cur_sample.append(cur_step_sample)

            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train_indicator):
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            # print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)

            step += 1
            if done:
                break

            if j > 500:
                break

        if np.mod(i, 3) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("saved/actormodel_{}_{}.h5".format(
                    model_name, int(step / 10000)),
                                         overwrite=True)
                # with open("actormodel.json", "w") as outfile:
                #     json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("saved/criticmodel_{}_{}.h5".format(
                    model_name, int(step / 10000)),
                                          overwrite=True)
                # with open("criticmodel.json", "w") as outfile:
                #     json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")
        s = "{},{},{},{},{},{:.3f}\n".format(gap, attack_step, attack_target,
                                             i, j, total_reward)
        attack_valid = 1
        attack_step = -1
        attack_target = 0
        with open('logs/pm_adv_test.csv'.format(model_name), 'a') as the_file:
            the_file.write(s)
        overall_scores.append(total_reward)
        plt.clf()
        plt.plot(overall_scores)
        plt.savefig("train_plots/{}_{}.jpg".format(model_name,
                                                   int(step / 10000)))
        # with open('samples/{}_{:05d}.pk'.format(model_name, i), 'w') as outfile:
        # pickle.dump(cur_sample, outfile)

    env.end()  # This is for shutting down TORCS
    print("Finish.")
Beispiel #28
0
def playGame(train_indicator=0):  #1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001  #Target Network HyperParameters
    LRA = 0.0001  #Learning rate for Actor
    LRC = 0.001  #Learning rate for Critic

    action_dim = 3  #Steering/Acceleration/Brake
    state_dim = 29  #of sensors input

    np.random.seed(1337)

    vision = False

    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  #Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    # Now load the weight
    # print("Now we load the weight")
    # try:
    #     actor.model.load_weights("actormodel.h5")
    #     critic.model.load_weights("criticmodel.h5")
    #     actor.target_model.load_weights("actormodel.h5")
    #     critic.target_model.load_weights("criticmodel.h5")
    #     print("Weight load successfully")
    # except:
    #     print("Cannot find the weight")

    print("TORCS Experiment Start.")
    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            ob = env.reset(
                relaunch=True
            )  #relaunch TORCS every 3 episode because of the memory leak error
        else:
            ob = env.reset()

        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                         ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
        print(ob.track)

        total_reward = 0.
        stucked = 0
        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            #The following code does the stochastic brake
            if random.random() <= 0.1:
                print("********Now we apply the brake***********")
                noise_t[0][2] = train_indicator * max(
                    epsilon, 0) * OU.function(a_t_original[0][2], 0.2, 1.00,
                                              0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            s_t1 = np.hstack(
                (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                 ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))

            buff.add(s_t, a_t[0], r_t, s_t1, done)  #Add replay buffer

            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train_indicator):
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t,
                  "Loss", loss)

            step += 1
            if done:
                break

        if np.mod(i, 3) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("actormodel.h5", overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("criticmodel.h5", overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
Beispiel #29
0
def playGame(train_indicator=1):  #1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001  #Target Network HyperParameters
    LRA = 0.0001  #Learning rate for Actor
    LRC = 0.001  #Learning rate for Critic

    action_dim = 3  #Steering/Acceleration/Brake
    state_dim = 512  #of sensors input

    np.random.seed(61502)

    vision = True

    EXPLORE = 100000.
    episode_count = 600000
    max_steps = 1800
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0
    esar2 = []
    esar4 = []

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    #We insert the Deep Q Image Processing Module
    args = {
        'save_model_freq': 10000,
        'target_model_update_freq': 10000,
        'normalize_weights': True,
        'learning_rate': .00025,
        'model': None
    }

    # print(args["save_model_freq"])

    C = DeepQNetwork(512, sess, '/home/lou/DDPG-Keras-Torcs', args=args)
    # print(C)

    x, h_fc1 = C.buildNetwork('test', trainable=True, numActions=1)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  #Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    #Now load the weight
    print("Now we load the weight")
    try:
        actor.model.load_weights("actormodelIMG.h5")
        critic.model.load_weights("criticmodelIMG.h5")
        actor.target_model.load_weights("actormodel2IMG.h5")
        critic.target_model.load_weights("criticmodel2IMG.h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    print("TORCS Experiment Start.")
    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 500) == 0:
            ob = env.reset(
                relaunch=True
            )  #relaunch TORCS every 500 episode because of the memory leak error
        else:
            ob = env.reset()

        imgfinal = np.zeros((1, 128, 128, 4), dtype=np.int32)
        s_t = C.getFC7(imgfinal)

        total_reward = 0.

        imglst = []
        speed = 0
        stepreset = 0

        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])

            noise_t = np.zeros([1, action_dim])

            # a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))

            a_t_original = actor.model.predict(C.getFC7(imgfinal))
            #print('ATORIGINAL', a_t_original)
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            #The following code does the stochastic brake
            if random.random() <= 0.05:
                print("********Now we apply the brake***********")
                noise_t[0][2] = train_indicator * max(
                    epsilon, 0) * OU.function(a_t_original[0][2], 0.2, 1.00,
                                              0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            imglst.append(ob.img)
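            # once four frames are available, stack them into a single NHWC
            # tensor of shape (1, 128, 128, 4) for the image feature extractor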

            if len(imglst) == 4:
                imgcopy = imglst[:]
                imgfinal = np.stack(imgcopy)
                # print("Original stacked matrix", imgfinal)

                imgfinal = np.reshape(imgfinal, (4, 128, 128))
                # print("Reshaped stacked matrix", imgfinal)

                imgfinal = np.transpose(imgfinal, (1, 2, 0))
                # print("Transposed stacked matrix", imgfinal)

                imgfinal = np.reshape(imgfinal, (1, 128, 128, 4))
                # print("Shape of imgfinal", imgfinal.shape)

            s_t1 = C.getFC7(imgfinal)

            buff.add(s_t, a_t[0], r_t, s_t1, done)  #Add replay buffer

            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            # print('NEW STATES', new_states)

            # target_q_values = critic.target_model.predict([C.getFC7(imgfinal), actor.target_model.predict(C.getFC7(imgfinal))])

            # print('ACTOR TARGET MODEL PREDICT', C.getFC7(imgfinal))
            new_states = np.reshape(new_states, (-1, 512))

            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])
            # print('TARGET Q VALUES', target_q_values)
            # print('NEW STATES', new_states)
            # print('ACTOR MODEL PREDICT NEW STATES', actor.target_model.predict(new_states))
            # print('REWARDS', rewards)

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train_indicator):
                states = np.reshape(states, (-1, 512))

                print('STATESSHAPE', np.shape(states))
                print('ACTIONSSHAPE', np.shape(actions))
                print('YT', np.shape(y_t))

                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1
            speed += ob.speedX * 300
            speedavg = speed / (stepreset + 1)  # stepreset increments below; +1 avoids division by zero on the first step
            #print("SPEED X", ob.speedX)

            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t,
                  "Loss", loss, "Average Speed", speedavg)
            esar = (i, step, a_t, r_t, loss, speedavg)
            esar2.append(esar)

            step += 1
            stepreset += 1

            if len(imglst) >= 4:
                del imglst[0]

            # print("Length of imglist", len(imglst))
            # print("List itself", imgfinal)

            if done:
                break

        if np.mod(i, 50) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("actormodelIMG.h5", overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("criticmodelIMG.h5", overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")

        esar3 = (i, step, total_reward, speedavg)
        esar4.append(esar3)

        if np.mod(i, 50) == 0:
            save_object(esar2, 'IntraEpisode.pkl')
            save_object(esar4, 'InterEpisode.pkl')

    env.end()  # This is for shutting down TORCS
    print("Finish.")
    print("Saving esars.")
Beispiel #30
0
def playGame(train_indicator=1):    #1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001     #Target Network HyperParameters
    LRA = 0.00005    #Learning rate for Actor
    LRC = 0.0005     #Learning rate for Critic

    action_dim = 3  #Steering/Acceleration/Brake
    state_dim = 29  #of sensors input

    np.random.seed(1337)

    vision = False

    EXPLORE = 200000.
    if train_indicator:
        episode_count = 1000
    else:
        episode_count = 20
    max_steps = 4000
    step = 0
    if train_indicator:
        epsilon = 1
    else:
        epsilon = 0
    min_laptime = 10000000

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)

    buff = ReplayBuffer(BUFFER_SIZE)    #Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    #Now load the weight
    # loading networks
    print("Now we load the weight")
    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state("saved_networks/")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")
    print("TORCS Experiment Start.")
    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            ob = env.reset(relaunch=True)   #relaunch TORCS every 3 episode because of the memory leak error
        else:
            ob = env.reset()

        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,  ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
     
        total_reward = 0.
        # totalLaptime = 0.
        for j in range(max_steps):
            loss = 0
            if train_indicator:
                epsilon -= 1.0 / EXPLORE
                epsilon = max(epsilon, 0.10)
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])
            
            a_t_original = actor.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][0],  0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][1],  0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], -0.1, 1.00, 0.05)

            #The following code does the stochastic brake
            #if random.random() <= 0.1:
            #    print("********Now we apply the brake***********")
            #    noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2],  0.2 , 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0], train_indicator)

            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
        
            buff.add(s_t, a_t[0], r_t, s_t1, done)      #Add replay buffer
            
            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_predict(new_states, actor.target_predict(new_states))
           
            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA*target_q_values[k]
       
            if (train_indicator):
                loss += critic.train_on_batch(states, actions, y_t)
                a_for_grad = actor.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            if np.mod(step, 100) == 0:
                print("Episode", i, "Step", step, "Epsilon", epsilon, "Action", a_t, "Reward", r_t, "Loss", loss) #, "curLapTime", ob.curLapTime)
        
            step += 1
            if i == 0:
                break
            if done:
                break

        # if np.mod(i, 3) == 0:
        if (train_indicator) and i > 0:
            if env.lapTime < min_laptime and env.num_lap == 10:
                min_laptime = env.lapTime
                print("Now we save model")
                saver.save(sess, 'saved_networks/' + 'network' + '-ddpg-{}'.format(i))

        print("TOTAL REWARD @ " + str(i) +"-th Episode  : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
Beispiel #31
0
def playGame(train_indicator=0):  # 1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001  # Target Network HyperParameters
    LRA = 0.00001  # Learning rate for Actor
    LRC = 0.0001  # Learning rate for Critic

    server_number = 5
    # node_number = 18
    hot_node_number = 150
    action_dim = hot_node_number  # Number of servers
    state_dim = hot_node_number * (server_number + 1 + 10
                                   )  # 1000 node * 10 features
    # baseline = 4e-05 #load&locality of baselines

    np.random.seed(500)

    # vision = False

    EXPLORE = 100000.
    episode_count = 100
    max_steps = 100000
    line_number = 1000
    step_number = 35
    # reward = 0
    done = False
    step = 0
    epsilon = 1
    # indicator = 0

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer

    # Generate a MDS environment
    env = MetaEnvironment(server_number)

    # Now load the weight
    print("Now we load the weight")
    try:
        actor.model.load_weights("model/actormodel-" + str(server_number) +
                                 ".h5")
        critic.model.load_weights("model/criticmodel-" + str(server_number) +
                                  ".h5")
        actor.target_model.load_weights("model/actormodel-" +
                                        str(server_number) + ".h5")
        critic.target_model.load_weights("model/criticmodel-" +
                                         str(server_number) + ".h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    print("Experiment Start.")

    f = open("query.txt")
    queryList = []
    for line in f.readlines():
        line = line.strip()
        queryList.append(line)
    f.close()

    sumLoc = 0
    sumLod = 0
    lossList = []
    mdsLoadList = [[] for x in range(server_number)]

    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        # if np.mod(i, 3) == 0:
        # ob = env.reset(relaunch=True)   #relaunch every 3 episode because of the memory leak error
        # else:
        # ob = env.reset()

        traceList = queryList[0:line_number]  # Reset
        s_t = env.state(traceList)  # Get State from env

        localityList = []
        loadList = []

        total_reward = 0.
        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            # add noise
            a_t_original = actor.model.predict(s_t)
            for k in range(action_dim):
                noise_t[0][k] = train_indicator * max(
                    epsilon, 0) * OU.function(a_t_original[0][k], 0.0, 0.60,
                                              0.30)

            for m in range(action_dim):
                a_t[0][m] = a_t_original[0][m]  # + noise_t[0][m]

            migration = env.take_actions(a_t[0])
            print("migration", migration)

            tracelist = queryList[(j + 1) * line_number:(j + 2) * line_number]
            s_t1 = env.state(tracelist)  # Update state from env
            # r_t = 0.5*env.locality() + 50*env.load() - baseline
            # print("gagaga", 1e5*env.locality() + 1e7*env.load())
            # 1.5, 3, 2
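            # reward: weighted locality and load-balance terms minus a
            # migration penalty (coefficients appear hand-tuned)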
            x = 1e5 * env.locality() + 1e7 * env.load() - 1.5 * migration
            # x = 1e5*env.locality() + 1.5 * 1e7*env.load()
            # r_t = 1.0 / (1.0 + np.exp(-(x/50)))
            r_t = x

            if j == step_number:
                done = True
            else:
                done = False

            buff.add(s_t, a_t[0], r_t, s_t1, done)  # Add replay buffer

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])
            states = states.reshape(len(batch), -1)
            new_states = new_states.reshape(len(batch), -1)
            actions = actions.reshape(len(batch), -1)

            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train_indicator):
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            # print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss, "Locality", env.locality(), "Load", env.load())
            print("Episode", i, "Step", step, "Reward", r_t, "Loss", loss,
                  "Locality", env.locality(), "Load", env.load())

            lossList.append(loss)
            localityList.append(env.locality())
            loadList.append(env.load())
            for index in range(server_number):
                mdsLoadList[index].append(env.loadList[index])

            step += 1
            if done:
                break

        curLocalitySum = sum(localityList)
        curLoadSum = sum(loadList)

        # f = open('' + str(server_number) + '.txt', 'w')
        # f.write(','.join(map(str, lossList)))
        # f.close()

        # f = open('anglecut-mdsload-' + str(server_number) + '.txt', 'w')
        # for i in range(server_number):
        #     f.write(','.join(map(str, mdsLoadList[i])))
        #     f.write('\n')
        # f.close()
        # print("写入成功")

        if np.mod(i, 3) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("model/actormodel-" +
                                         str(server_number) + ".h5",
                                         overwrite=True)
                with open("model/actormodel-" + str(server_number) + ".json",
                          "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("model/criticmodel-" +
                                          str(server_number) + ".h5",
                                          overwrite=True)
                with open("model/criticmodel-" + str(server_number) + ".json",
                          "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        # print("Final Locality:", env.final_locality(), "Final Load Balancing:", env.final_load())
        # env.clear()
        print("")

    # env.end()
    print("Finish.")