Code example #1
import numpy as np
import matplotlib.pyplot as plt


def viz_noise(n_samples=1000, iter=0.):
    """Plot the average OU exploration noise over the action range [-1, 1].

    Relies on the module-level EXPLORE constant and the OU helper class.
    """
    actions = np.linspace(-1, 1, 20)
    epsilon = 1 - iter / EXPLORE  # exploration factor decays as training iterations grow
    ou = OU()
    noise = np.zeros_like(actions)
    # Average the epsilon-scaled OU noise over n_samples draws for each action value
    for i in range(n_samples):
        noise += epsilon * ou.function(actions, 0.0, 0.60, 0.30)
    noise /= n_samples
    plt.plot(actions, noise)
    plt.show()
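The sketch above assumes a module-level EXPLORE constant and an OU helper whose function(x, mu, theta, sigma) method returns one noise sample. A minimal Ornstein-Uhlenbeck helper consistent with how it is called throughout these examples could look like the following; this is an illustrative sketch, not necessarily the exact ddpg.OU implementation used here.

import numpy as np


class OU(object):
    """Ornstein-Uhlenbeck process: noise that reverts toward the mean mu."""

    def function(self, x, mu, theta, sigma):
        # theta pulls x back toward mu; sigma scales the Gaussian exploration term
        return theta * (mu - x) + sigma * np.random.randn(1)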
Code example #2
def startTraining(train_indicator=0):  #1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001  #Target Network HyperParameters
    LRA = 0.0001  #Learning rate for Actor
    LRC = 0.001  #Learning rate for Critic

    print('ACTION DIM: ', RC.GB_ACTION_DIM)
    action_dim = RC.GB_ACTION_DIM
    # Each contact scenario consists of the hand and plate state (sufficient to serve as the environment observation)
    print('STATE DIM: ', RC.GB_STATE_DIM)
    state_dim = RC.GB_STATE_DIM

    np.random.seed(1337)

    vision = False

    EXPLORE = 1000.  #100000.
    # Double loop over episodes and steps:
    # --> The env is reset periodically even if the agent keeps succeeding without terminating (done),
    #     to avoid overfitting (learning one trajectory by heart instead of exploring new actions).
    # A new episode starts once done (termination) or the max_steps threshold is reached.
    episode_count = 1000000
    max_steps = 10000  # Large per-episode step cap, chosen to avoid underfitting
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  #Create replay buffer

    print('START ENV', gbClientID, gbRobotHandle)
    env = RobotOperationEnvironment(gbClientID, RC.GB_CSERVER_ROBOT_ID,
                                    gbRobotHandle)

    ## ---------------------------------------------------------------

    #Now load the weights
    print("Now we load the weights")
    try:
        actor.model.load_weights("actormodel.h5")
        critic.model.load_weights("criticmodel.h5")
        actor.target_model.load_weights("actormodel.h5")
        critic.target_model.load_weights("criticmodel.h5")
        print("Weight loaded successfully!")
        print("######################################################")
        print("######################################################")
        print("######################################################")
    except:
        print("Cannot find the weight")

    print("Manipulator DDPG Training Experiment Start.")
    for episode in range(episode_count):

        if (RC.GB_TRACE):
            print("Episode : " + str(episode) + " Replay Buffer " +
                  str(buff.count()))

        total_reward = 0.
        for j in range(max_steps):

            if (RC.isUnknownTask() or episode == 0):
                ob = env.reset()
            else:  #We reuse ob from the previous step, since reset() returns a meaningless value here
                env.reset()
            #s_t = np.reshape(ob, (-1, action_dim))
            s_t = gb_observation_2_state(ob)
            #print('OB', s_t)

            ## -------------------------------------------------------------------------------------------------------
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            #print("ST RESHAPE", np.reshape(s_t, (1, s_t.shape[0])))
            #if(j!=0):
            print('Episode ', episode, 'Step ', j, '--------------')
            print('Start waiting for the next action',
                  env._robot.getOperationState())
            while (env._robot.getOperationState() != RC.CROBOT_STATE_READY):
                time.sleep(0.01)

            # --------------------------------------------------------------------------------------------------------
            a_t_original = actor.model.predict(
                np.reshape(s_t, (1, s_t.shape[0])))
            print('Generated action:', a_t_original)

            #print("a_t", a_t)
            #print("noise_t", noise_t)
            #print("a_t_original", a_t_original)
            for i in range(action_dim):
                noise_t[0][i] = train_indicator * max(
                    epsilon, 0) * OU.function(a_t_original[0][i], 0.0, 0.60,
                                              0.30)

            #The following code does the stochastic brake
            #if random.random() <= 0.1:
            #    print("********Now we apply the brake***********")
            #    noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2],  0.2 , 1.00, 0.10)

            for i in range(action_dim):
                a_t[0][i] = a_t_original[0][i] + noise_t[0][i]
            ob, r_t, done, info = env.step(a_t[0])

            s_t1 = gb_observation_2_state(ob)
            #print('OB reshape', s_t1)

            buff.add(s_t, a_t[0], r_t, s_t1, done)  #Add replay buffer

            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)

            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            #print('New State:', new_states)
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])  # shape placeholder; overwritten with the Bellman targets below

            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])
            #print('target_q_values:', target_q_values)
            #print('batch:', len(batch))
            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train_indicator):
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            if (RC.GB_TRACE):
                print("Episode", episode, "Step", step, "Action", a_t,
                      "Reward", r_t, "Loss", loss)

            # Periodically checkpoint the model (every 3 steps)
            if np.mod(j, 3) == 0:
                if (train_indicator):
                    if (RC.GB_TRACE):
                        print("Now we save model")
                    actor.model.save_weights("actormodel.h5", overwrite=True)
                    with open("actormodel.json", "w") as outfile:
                        json.dump(actor.model.to_json(), outfile)

                    critic.model.save_weights("criticmodel.h5", overwrite=True)
                    with open("criticmodel.json", "w") as outfile:
                        json.dump(critic.model.to_json(), outfile)

            if np.mod(j, 10) == 0:
                print("TOTAL REWARD @ " + str(episode) +
                      "-th Episode  : Reward " + str(total_reward))
                print("Total Step: " + str(step))
                print("")

            step += 1

            if done:
                break

    print("Finish.")
Code example #3
#from keras.engine.training import collect_trainable_weights
import json

# DDPG
from ddpg.ReplayBuffer import ReplayBuffer
from ddpg.ActorNetworkObjSupport import ActorNetwork
from ddpg.CriticNetwork import CriticNetwork
from ddpg.OU import OU
import timeit

# MATPLOT
import matplotlib as mpl
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt

OU = OU()  #Ornstein-Uhlenbeck Process

try:
    import vrep
except:
    print('--------------------------------------------------------------')
    print('"vrep.py" could not be imported. This means very probably that')
    print('either "vrep.py" or the remoteApi library could not be found.')
    print('Make sure both are in the same folder as this file,')
    print('or appropriately adjust the file "vrep.py"')
    print('--------------------------------------------------------------')
    print('')

CSERVER_PORT = 19999
##############################################################################################################################################################
##############################################################################################################################################################
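The ReplayBuffer imported here is only used through add(), getBatch(), and count(). A minimal buffer compatible with those calls might look like the sketch below; the class name and internals are assumptions, not the actual ddpg.ReplayBuffer.

import random
from collections import deque


class SimpleReplayBuffer(object):
    """Minimal FIFO experience buffer with uniform random sampling."""

    def __init__(self, buffer_size):
        self.buffer = deque(maxlen=buffer_size)

    def add(self, state, action, reward, new_state, done):
        self.buffer.append((state, action, reward, new_state, done))

    def getBatch(self, batch_size):
        # Return everything while the buffer is still smaller than batch_size
        return random.sample(list(self.buffer), min(len(self.buffer), batch_size))

    def count(self):
        return len(self.buffer)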
Code example #4
def startTraining(train_indicator=0):  #1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001  #Target Network HyperParameters
    LRA = 0.0001  #Learning rate for Actor
    LRC = 0.001  #Learning rate for Critic

    action_dim = 7  # Joint Movement
    # Each contact scenario consists of the hand and plate state (sufficient to serve as the environment observation)
    state_dim = 20  # Joint Ball Pos + Velocity

    np.random.seed(1337)

    vision = False

    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  #Create replay buffer

    env = KukaCatchObjsGymEnv(renders=True)

    ## ---------------------------------------------------------------

    #Now load the weights
    print("Now we load the weights")
    try:
        actor.model.load_weights("actormodel.h5")
        critic.model.load_weights("criticmodel.h5")
        actor.target_model.load_weights("actormodel.h5")
        critic.target_model.load_weights("criticmodel.h5")
        print("Weight load successfully")
        print("######################################################")
        print("######################################################")
        print("######################################################")
    except:
        print("Cannot find the weight")

    print("Falling obj catching Experiment Start.")
    for episode in range(episode_count):

        if (gb_trace):
            print("Episode : " + str(episode) + " Replay Buffer " +
                  str(buff.count()))

        ob = env.reset()

        #s_t = np.reshape(ob, (-1, action_dim))
        # ob[0:14]  : joint state (pos & vel)
        # ob[14:17] : ball position X, Y, Z
        # ob[17:20] : ball linear velocity X, Y, Z
        s_t = np.hstack(ob[:20])
        #print('OB', s_t)

        total_reward = 0.
        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            #print("ST RESHAPE", s_t.reshape(1, s_t.shape[0]), s_t.shape[0])
            a_t_original = actor.model.predict(
                np.reshape(s_t, (1, s_t.shape[0])))

            #print("a_t", a_t)
            #print("noise_t", noise_t)
            #print("a_t_original", a_t_original)
            for i in range(action_dim):
                noise_t[0][i] = train_indicator * max(
                    epsilon, 0) * OU.function(a_t_original[0][i], 0.0, 0.60,
                                              0.30)

            #The following code does the stochastic brake
            #if random.random() <= 0.1:
            #    print("********Now we apply the brake***********")
            #    noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2],  0.2 , 1.00, 0.10)

            for i in range(action_dim):
                a_t[0][i] = a_t_original[0][i] + noise_t[0][i]
            ob, r_t, done, info = env.step(a_t[0])

            # ob[0:14]  : joint state (pos & vel)
            # ob[14:17] : ball position X, Y, Z
            # ob[17:20] : ball linear velocity X, Y, Z
            s_t1 = np.hstack(ob[:20])
            #print('OB reshape', s_t1)

            buff.add(s_t, a_t[0], r_t, s_t1, done)  #Add replay buffer

            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)

            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            #print('New State:', new_states)
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])  # shape placeholder; overwritten with the Bellman targets below

            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])
            #print('target_q_values:', target_q_values)
            #print('batch:', len(batch))
            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train_indicator):
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            #if(gb_trace):
            #print("Episode", episode, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)

            step += 1
            if done:
                break

        if np.mod(episode, 3) == 0:
            if (train_indicator):
                if (gb_trace):
                    print("Now we save model")
                actor.model.save_weights("actormodel.h5", overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("criticmodel.h5", overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)

        if (gb_trace):
            print("TOTAL REWARD @ " + str(episode) + "-th Episode  : Reward " +
                  str(total_reward))
            print("Total Step: " + str(step))
            print("")

    print("Finish.")
Code example #5
def train(sess,image_agent,continue_train=False):
    BUFFER_SIZE = 100000
    BATCH_SIZE = 128
    GAMMA = 0.9 
    TAU = 0.001 
    INIT_LRA = 0.000001
    INIT_LRC = 0.0001 
    EPISODE_MAX_STEP = 5000
    # DECAY_RATE = 0.5 
    # DECAY_STEP = 3000000
    #TOTAL_EPISODE = 30000
    TOTAL_EPISODE = 20000
    EXPLORE = 500000
    CURRENT_STEP=0
    actor = ActorNetwork(sess,BATCH_SIZE,TAU,INIT_LRA)
    critic = CriticNetwork(sess,BATCH_SIZE,TAU,INIT_LRC)
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    sess.graph.finalize()
    ou = OU()
    # if continue_train:
    #     #TODO: reload network  and  params
    #     pass
    buffer_follow = ReplayBuffer(BUFFER_SIZE)
    buffer_straight = ReplayBuffer(BUFFER_SIZE)
    buffer_left = ReplayBuffer(BUFFER_SIZE)
    buffer_right = ReplayBuffer(BUFFER_SIZE)
    # One replay buffer per high-level command branch: follow, left, right, straight
    buffer_dict = {0: buffer_follow, 1: buffer_left, 2: buffer_right, 3: buffer_straight}
   
    epsilon = 1.0

    env = Env("./log","./data",image_agent)
    #env.reset()
    
    for i in range(TOTAL_EPISODE):
        try:
            ob = env.reset()
        except Exception:
            continue
        total_reward = 0
        episode_step = 0
        s_t = ob
        for j in range(EPISODE_MAX_STEP):
            # State layout: image feature vector + speed + high-level direction command
            if s_t is None or len(s_t) < 514:
                continue
            epsilon -= 1.0 / EXPLORE
            image_input = s_t[0:-2]
            speed_input = s_t[-2:-1]
            # Direction commands: GO_STRAIGHT = 5.0, TURN_RIGHT = 4.0, TURN_LEFT = 3.0, LANE_FOLLOW = 2.0
            direction = s_t[-1:]
            branch_st = int(direction - 2)  # map command to branch index 0..3
            if branch_st == -2:  # REACH_GOAL = 0
                break
            a_t = np.zeros([1, 3])  # steer, throttle, brake
            noise_t = np.zeros([1, 3])
            a_t_pridect = actor.pridect_action(image_input, speed_input, branch_st)
            # OU exploration noise per control channel (steer, throttle, brake)
            noise_t[0][0] = max(epsilon, 0) * ou.function(a_t_pridect[0][0], 0, 0.6, 0.3)
            noise_t[0][1] = max(epsilon, 0) * ou.function(a_t_pridect[0][1], 0.5, 1, 0.1)
            noise_t[0][2] = max(epsilon, 0) * ou.function(a_t_pridect[0][2], -0.1, 1, 0.05)
            a_t = a_t_pridect + noise_t
            # if(CURRENT_STEP<10000) and  j<50:
            #      a_t[0][2]=0
            #      a_t[0][1]=max(0.6,a_t[0][1])
            try:
                ob,r_t,done = env.step(a_t[0])
                s_t1 = ob
                if s_t1 is None or len(s_t1)<514:
                    continue
                buffer_dict[branch_st].add(s_t,a_t[0],r_t,s_t1,done)
            except Exception:
                break

            

            # Train actor and critic on a randomly chosen command branch once its buffer holds a full batch
            branch_to_train = random.choice([0, 1, 2, 3])
            if buffer_dict[branch_to_train].count() > BATCH_SIZE:
                train_ddpg(actor, critic, buffer_dict, BATCH_SIZE, branch_to_train)
            total_reward += r_t
            s_t = s_t1
            CURRENT_STEP += 1
            episode_step += 1
            if (done):
                break
        
        print("buffer lenth:{},{},{},{},total reward:{},current_step:{},total_step:{}".format(buffer_dict[0].count(),
                    buffer_dict[1].count(),
                    buffer_dict[2].count(),
                    buffer_dict[3].count(),
                    total_reward,episode_step,CURRENT_STEP))
        
        if np.mod(i,2000)==0:
            saver.save(sess,'./model/ddpg_model')
            with open("./episode.txt","w") as log:
                log.write(("{},{}\n").format(i,epsilon))
            with open("./buffer.pkl","wb") as buffer_log:
                pickle.dump(buffer_dict, buffer_log)
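The continue_train branch above is left as a TODO. A minimal sketch of what the reload could look like, matching the checkpoint path, the episode.txt format ("{episode},{epsilon}"), and the buffer.pkl pickle written at the end of the loop; the helper name and return convention are assumptions.

import pickle


def restore_training_state(sess, saver, buffer_dict):
    # Restore network weights saved via saver.save(sess, './model/ddpg_model')
    saver.restore(sess, './model/ddpg_model')
    # Recover the last logged episode index and epsilon
    with open("./episode.txt") as log:
        episode_str, epsilon_str = log.read().strip().split(",")
    # Reload the per-branch replay buffers
    with open("./buffer.pkl", "rb") as buffer_log:
        buffer_dict.update(pickle.load(buffer_log))
    return int(episode_str), float(epsilon_str)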
Code example #6
def playGame(actor, critic, train=False):
    GAMMA = 0.99

    vision = False

    episode_count = 2000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    buff = ReplayBuffer(BUFFER_SIZE)  #Create replay buffer
    ou = OU()  # Ornstein-Uhlenbeck Process

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=False, gear_change=False)

    print("TORCS Experiment Start.")
    for n_episode in range(episode_count):

        print("Episode : " + str(n_episode) + " Replay Buffer " +
              str(buff.count()))

        ob = env.reset()

        s_t = np.hstack(
            (ob.angle, ob.trackPos)
        )  # ob.track, ob.trackPos, ob.speedX, ob.speedY,  ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))

        total_reward = 0.
        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))

            noise_t[0][0] = train * max(epsilon, 0) * ou.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            #noise_t[0][1] = train * max(epsilon, 0) * ou.function(a_t_original[0][1], 0.5, 1.00, 0.10)
            #noise_t[0][2] = train * max(epsilon, 0) * ou.function(a_t_original[0][2], -0.1, 1.00, 0.05)

            #The following code does the stochastic brake
            #if random.random() <= 0.1:
            #    print("********Now we apply the brake***********")
            #    noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2],  0.2 , 1.00, 0.10)

            for i in range(action_dim):
                a_t[0][i] = a_t_original[0][i] + noise_t[0][i]

            ob, r_t, done, info = env.step(a_t[0])

            s_t1 = np.hstack(
                (ob.angle, ob.trackPos)
            )  #, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))

            buff.add(s_t, a_t[0], r_t, s_t1, done)  #Add replay buffer

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])  # shape placeholder; overwritten with the Bellman targets below

            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train):
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            if np.mod(n_episode, 10) == 0:
                print("Episode", n_episode, "Step", step, "Action", a_t,
                      "Reward", r_t, "Loss", loss)

            step += 1
            if done:
                break

        if np.mod(n_episode, 3) == 0:
            if (train):
                print("Now we save model")
                actor.model.save_weights("data/actormodel.h5", overwrite=True)
                with open("data/ctormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("data/criticmodel.h5",
                                          overwrite=True)
                with open("data/criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(n_episode) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")