Example #1
# (Assumed imports, not shown in the original listing: numpy as np, tensorflow as tf,
#  TorcsEnv, ReplayBuffer, ActorNetwork, CriticNetwork, and the OU noise helper.)
def playGame(train_indicator=1):    #1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001     #Target Network HyperParameters
    LRA = 0.00005    #Learning rate for Actor
    LRC = 0.0005     #Learning rate for Critic

    action_dim = 3  #Steering/Acceleration/Brake
    state_dim = 29  #number of sensor inputs

    np.random.seed(1337)

    vision = False

    EXPLORE = 200000.
    if train_indicator:
        episode_count = 1000
    else:
        episode_count = 20
    max_steps = 4000
    step = 0
    if train_indicator:
        epsilon = 1
    else:
        epsilon = 0
    min_laptime = 10000000

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)

    buff = ReplayBuffer(BUFFER_SIZE)    #Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    # Load previously saved network weights, if any
    print("Now we load the weight")
    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state("saved_networks/")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")
    print("TORCS Experiment Start.")
    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            ob = env.reset(relaunch=True)   #relaunch TORCS every 3 episodes because of the memory leak error
        else:
            ob = env.reset()

        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,  ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
     
        total_reward = 0.
        # totalLaptime = 0.
        for j in range(max_steps):
            loss = 0
            if train_indicator:
                epsilon -= 1.0 / EXPLORE
                epsilon = max(epsilon, 0.10)
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])
            
            a_t_original = actor.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][0],  0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][1],  0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], -0.1, 1.00, 0.05)

            #The following code does the stochastic brake
            #if random.random() <= 0.1:
            #    print("********Now we apply the brake***********")
            #    noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2],  0.2 , 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0], train_indicator)

            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
        
            buff.add(s_t, a_t[0], r_t, s_t1, done)      #Add replay buffer
            
            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[2] for e in batch])  # targets, initialized from rewards and overwritten below

            target_q_values = critic.target_predict(new_states, actor.target_predict(new_states))
           
            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA*target_q_values[k]
       
            if (train_indicator):
                loss += critic.train_on_batch(states, actions, y_t)
                a_for_grad = actor.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            if np.mod(step, 100) == 0:
                print("Episode", i, "Step", step, "Epsilon", epsilon, "Action", a_t, "Reward", r_t, "Loss", loss) #, "curLapTime", ob.curLapTime)
        
            step += 1
            if i == 0:
                break
            if done:
                break

        # if np.mod(i, 3) == 0:
        if (train_indicator) and i > 0:
            if env.lapTime < min_laptime and env.num_lap == 10:
                min_laptime = env.lapTime
                print("Now we save model")
                saver.save(sess, 'saved_networks/' + 'network' + '-ddpg-{}'.format(i))

        print("TOTAL REWARD @ " + str(i) +"-th Episode  : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
Example #2
        actions = np.asarray([e[1] for e in batch])
        rewards = np.asarray([e[2] for e in batch])
        new_states = np.asarray([e[3] for e in batch])
        dones = np.asarray([e[4] for e in batch])

        # set target yi = ri + gamma * target_critic_network(si+1, target_actor_network(si+1))
        new_states = new_states.reshape([len(batch), new_states.shape[1]])

        target_q_values = critic.target_predict(new_states, actor.target_predict(new_states))

        y_t = []
        for i in range(len(batch)):
            if dones[i]:
                y_t.append(rewards[i])
            else:
                y_t.append(rewards[i] + GAMMA*target_q_values[i])

        y_t = np.array(y_t).reshape([len(y_t), 1])

        # update critic network by minimizing loss L = 1/N sum(yi - critic_network(si,ai))**2
        critic.train(y_t, states, actions)
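The per-sample loop above builds the standard DDPG targets y_i = r_i + gamma * target_Q(s_{i+1}, target_actor(s_{i+1})), with y_i = r_i for terminal transitions. The same computation can be written in vectorized NumPy form; a sketch with illustrative names (Example #3 below does essentially this inline):

import numpy as np

def bellman_targets(rewards, dones, target_q_values, gamma=0.99):
    # y_i = r_i for terminal transitions, r_i + gamma * Q'(s_{i+1}, mu'(s_{i+1})) otherwise
    r = np.asarray(rewards, dtype=np.float32).reshape(-1, 1)
    d = np.asarray(dones, dtype=np.float32).reshape(-1, 1)
    q = np.asarray(target_q_values, dtype=np.float32).reshape(-1, 1)
    return r + gamma * q * (1.0 - d)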
Example #3
File: ddpg_hockey.py  Project: ataitler/DQN
        dt = 1 if done else 0

        # store transition in replay buffer
        buff.add(s_t, a_t[0], r_t, s_t1, dt)
        # sample a random minibatch of N transitions (si, ai, ri, si+1) from the replay buffer
        batch = buff.getBatch(BATCH_SIZE)
        states = np.asarray([e[0] for e in batch])
        actions = np.asarray([e[1] for e in batch])
        rewards = np.asarray([e[2] for e in batch])
        new_states = np.asarray([e[3] for e in batch])
        dones = np.asarray([e[4] for e in batch])

        # compute targets yi = ri + gamma * target_critic(si+1, target_actor(si+1)),
        # zeroing the bootstrap term for terminal transitions
        target_q_values = critic.target_predict(new_states, actor.target_predict(new_states))
        rt = rewards.reshape(rewards.size, 1)
        dones = dones.reshape(dones.size, 1)
        y_t = rt + GAMMA * target_q_values * (1 - dones)

        # update critic network by minimizing loss L = 1/N sum(yi - critic_network(si,ai))**2
        critic.train(y_t, states, actions)

        # update actor policy using the sampled policy gradient
        a_for_grad = actor.predict(states)
        grads = critic.gradients(states, a_for_grad)
        actor.train(states, grads)

        # update the target networks
        actor.target_train()
        critic.target_train()
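critic.gradients and actor.train, used in the actor update above, are defined in each project's network wrappers. Below is a minimal TF1-style sketch of the deterministic policy gradient step they conventionally implement (all tensor and variable names are illustrative, not taken from these repositories):

import tensorflow as tf

def build_actor_update(critic_q, critic_action, actor_out, actor_vars, lr=0.0001):
    # dQ/da evaluated at a = actor(s); this is what critic.gradients(states, a_for_grad) returns
    action_grads = tf.gradients(critic_q, critic_action)[0]
    # fed back into the actor graph, as actor.train(states, grads) does
    action_grad_ph = tf.placeholder(tf.float32, actor_out.get_shape())
    # chain rule: d mu(s)/d theta weighted by dQ/da, negated so Adam ascends Q
    params_grad = tf.gradients(actor_out, actor_vars, -action_grad_ph)
    train_op = tf.train.AdamOptimizer(lr).apply_gradients(list(zip(params_grad, actor_vars)))
    return action_grads, action_grad_ph, train_op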
Example #4
File: ddpg.py  Project: murthy95/DDPG_tf
def playGame(train_indicator=1):  #1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001  #Target Network HyperParameters
    LRA = 0.0001  #Learning rate for Actor
    LRC = 0.001  #Learning rate for Critic

    action_dim = 3  #Steering/Acceleration/Brake
    state_dim = [64, 64, 3]  # image observation: a single 64x64 RGB frame
    np.random.seed(1337)

    vision = True  #changing vision to True

    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  #Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    #Now load the weights (restore directly into the actor/critic variables)
    try:
        tf.train.Saver(var_list=actor.weights).restore(sess, 'actor_weights.ckpt')
        tf.train.Saver(var_list=critic.weights).restore(sess, 'critic_weights.ckpt')
        print("Weight load successfully")
    except Exception:
        print("Cannot find the weight")

    print("TORCS Experiment Start.")
    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            ob = env.reset(
                relaunch=True
            )  #relaunch TORCS every 3 episode because of the memory leak error
        else:
            ob = env.reset()

        # s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,  ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
        s_t = ob.img
        total_reward = 0.
        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.predict(s_t.reshape(state_dim))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            #The following code do the stochastic brake
            #if random.random() <= 0.1:
            #    print("********Now we apply the brake***********")
            #    noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2],  0.2 , 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            #s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
            s_t1 = ob.img.reshape(state_dim)
            buff.add(s_t, a_t[0], r_t, s_t1, done)  #Add replay buffer

            print "Do the batch update"
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[2] for e in batch])  # targets, initialized from rewards and overwritten below

            target_q_values = critic.target_predict(
                new_states, actor.target_predict(new_states))

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train_indicator):
                loss += critic.train(states, actions, y_t)
                a_for_grad = actor.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t,
                  "Loss", loss)

            step += 1
            if done:
                break

        if np.mod(i, 3) == 0:
            if (train_indicator):
                print("Now we save model")
                # save the current actor/critic variables under the names the loader above expects
                tf.train.Saver(var_list=actor.weights).save(sess, 'actor_weights.ckpt')
                tf.train.Saver(var_list=critic.weights).save(sess, 'critic_weights.ckpt')

        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
Example #5
class DriverAgent:
    def __init__(self, env_name, state_dim, action_dim):
        self.name = 'DriverAgent'  # name for uploading results
        self.env_name = env_name
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = state_dim
        self.action_dim = action_dim

        # Tensorflow Session
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)

        # Actor & Critic Network
        self.actor = ActorNetwork(self.sess, state_dim, action_dim, BATCH_SIZE,
                                  TAU, LRA)
        self.critic = CriticNetwork(self.sess, state_dim, action_dim,
                                    BATCH_SIZE, TAU, LRA)

        # Replay Memory
        self.memory = ReplayMemory(MEMORY_SIZE)

        # Loss value
        self.loss = 0

        # loading networks. modify as you want
        self.saver = tf.train.Saver()
        if not os.path.exists(ckp_dir):
            print("Could not find old network weights")
        else:
            self.saver.restore(self.sess, os.path.join(ckp_dir, ckp_name))
            print("Successfully loaded:", ckp_name)

    # Train code
    def train(self, state, action, reward, next_state, done):
        # Add information to the replay memory
        if not math.isnan(reward):
            self.memory.add(state, action, reward, next_state, done)

        if self.memory.count() <= START_REPLAY:
            return

        # Get batch from the replay memory
        batch = self.memory.getBatch(BATCH_SIZE)
        states = np.asarray([e[0] for e in batch])
        actions = np.asarray([e[1] for e in batch])
        rewards = np.asarray([e[2] for e in batch])
        new_states = np.asarray([e[3] for e in batch])
        dones = np.asarray([e[4] for e in batch])

        # Get target Q value of the critic network
        target_Q = self.critic.target_predict(
            [new_states, self.actor.target_predict(new_states)])

        # Compute the targets y_i = r_i + gamma * target_Q(s_{i+1}, target_actor(s_{i+1}))
        y_t = []
        for i in range(len(batch)):
            if dones[i]:
                y_t.append(rewards[i])
            else:
                y_t.append(rewards[i] + GAMMA * target_Q[i])
        y_t = np.resize(y_t, [BATCH_SIZE, 1])

        # Calculate loss value and gradient for each network, and train both
        _, loss = self.critic.train([states, actions], y_t)

        a_for_grad = self.actor.predict(states)
        grads = self.critic.gradients(states, a_for_grad)

        self.actor.train(states, grads)

        self.actor.target_train()
        self.critic.target_train()

    # save your own network
    def saveNetwork(self, episode):
        if not os.path.exists(ckp_dir):
            os.mkdir(ckp_dir)
        ckp_name_real = ckp_name + '_' + str(episode)
        self.saver.save(self.sess, os.path.join(ckp_dir, ckp_name_real))

    def action(self, state):
        # return the greedy action for the given state (no exploration noise)
        action = np.zeros([self.action_dim])
        action_pre = self.actor.predict([state])

        # ACTION: without noise
        action[0] = np.clip(action_pre[0][0], -1, 1)
        action[1] = np.clip(action_pre[0][1], 0, 1)
        action[2] = np.clip(action_pre[0][2], 0, 1)

        return action

    def noise_action(self, state, epsilon):
        # return an action according to the current policy and exploration noise
        action = np.zeros([self.action_dim])
        noise = np.zeros([self.action_dim])

        action_pre = self.actor.predict([state])

        noise[0] = epsilon * OU.function(action_pre[0][0], 0.0, 0.80, 0.60)
        noise[1] = epsilon * OU.function(action_pre[0][1], 0.7, 1.00, 0.10)
        noise[2] = epsilon * OU.function(action_pre[0][2], -0.1, 1.00, 0.05)

        # ACTION: with noise
        action[0] = np.clip(action_pre[0][0] + noise[0], -1, 1)
        action[1] = np.clip(action_pre[0][1] + noise[1], 0, 1)
        action[2] = np.clip(action_pre[0][2] + noise[2], 0, 1)

        return action
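Finally, actor.target_train() and critic.target_train(), called throughout these examples, are also project-defined; the conventional DDPG soft update they stand for is theta_target <- tau * theta + (1 - tau) * theta_target. A minimal TF1 sketch, with the variable lists assumed:

import tensorflow as tf

def make_soft_update_op(net_vars, target_vars, tau=0.001):
    # theta_target <- tau * theta + (1 - tau) * theta_target
    return tf.group(*[t.assign(tau * v + (1.0 - tau) * t)
                      for v, t in zip(net_vars, target_vars)])

# Hypothetical usage: build the op once, then sess.run(update_op) after each critic/actor update.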