def viz_noise(n_samples=1000, iter=0.):
    actions = np.linspace(-1, 1, 20)
    epsilon = 1 - iter / EXPLORE
    ou = OU()
    noise = np.zeros_like(actions)
    for i in range(n_samples):
        noise += epsilon * ou.function(actions, 0.0, 0.60, 0.30)
    noise /= n_samples

    import matplotlib.pyplot as plt
    plt.plot(actions, noise)
    plt.show()
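# Usage sketch (assumption: a module-level EXPLORE constant, e.g. EXPLORE = 1000.,
# and the OU class from the ddpg package are available, as in the import block
# further below). The epsilon factor scales the plotted noise linearly: 1.0 at
# iter=0 and 0.5 once half of the EXPLORE budget has been used.
def demo_viz_noise():
    viz_noise(n_samples=1000, iter=0.)            # full exploration noise
    viz_noise(n_samples=1000, iter=EXPLORE / 2.)  # noise halved by the epsilon schedule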
def startTraining(train_indicator=0):  # 1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001     # Target network hyperparameter
    LRA = 0.0001    # Learning rate for Actor
    LRC = 0.001     # Learning rate for Critic

    print('ACTION DIM: ', RC.GB_ACTION_DIM)
    action_dim = RC.GB_ACTION_DIM
    # Each contacting scenario consists of hand and plate state (sufficient to be used as the Environment Observation)
    print('STATE DIM: ', RC.GB_STATE_DIM)
    state_dim = RC.GB_STATE_DIM

    np.random.seed(1337)
    vision = False
    EXPLORE = 1000.  # 100000.

    # Double loop over episodes and steps:
    # --> The env is still reset even when the agent learns so well that it never fails (done),
    #     which avoids overfitting (learning by heart instead of exploring new actions).
    # A new episode starts when done (termination) or the max_steps threshold is reached.
    episode_count = 1000000
    max_steps = 10000  # Large enough to avoid underfitting
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer

    print('START ENV', gbClientID, gbRobotHandle)
    env = RobotOperationEnvironment(gbClientID, RC.GB_CSERVER_ROBOT_ID, gbRobotHandle)

    ## ---------------------------------------------------------------
    # Now load the weights
    print("Now we load the weights")
    try:
        actor.model.load_weights("actormodel.h5")
        critic.model.load_weights("criticmodel.h5")
        actor.target_model.load_weights("actormodel.h5")
        critic.target_model.load_weights("criticmodel.h5")
        print("Weights loaded successfully!")
        print("######################################################")
        print("######################################################")
        print("######################################################")
    except:
        print("Cannot find the weights")

    print("Manipulator DDPG Training Experiment Start.")
    for episode in range(episode_count):
        if (RC.GB_TRACE):
            print("Episode : " + str(episode) + " Replay Buffer " + str(buff.count()))
        total_reward = 0.

        for j in range(max_steps):
            if (RC.isUnknownTask() or episode == 0):
                ob = env.reset()
            else:
                # We keep ob from the previous step, since this reset returns a meaningless value
                env.reset()
            #s_t = np.reshape(ob, (-1, action_dim))
            s_t = gb_observation_2_state(ob)
            #print('OB', s_t)

            ## -------------------------------------------------------------------------------------------------------
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            #print("ST RESHAPE", np.reshape(s_t, (1, s_t.shape[0])))
            #if (j != 0):
            print('Episode ', episode, 'Step ', j, '--------------')
            print('Start waiting for the next action', env._robot.getOperationState())
            while (env._robot.getOperationState() != RC.CROBOT_STATE_READY):
                time.sleep(0.01)

            ## -------------------------------------------------------------------------------------------------------
            a_t_original = actor.model.predict(np.reshape(s_t, (1, s_t.shape[0])))
            print('Generated action:', a_t_original)
            #print("a_t", a_t)
            #print("noise_t", noise_t)
            #print("a_t_original", a_t_original)
            for i in range(action_dim):
                noise_t[0][i] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][i], 0.0, 0.60, 0.30)

            # The following code does the stochastic brake
            #if random.random() <= 0.1:
            #    print("********Now we apply the brake***********")
            #    noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], 0.2, 1.00, 0.10)

            for i in range(action_dim):
                a_t[0][i] = a_t_original[0][i] + noise_t[0][i]

            ob, r_t, done, info = env.step(a_t[0])
            s_t1 = gb_observation_2_state(ob)
            #print('OB reshape', s_t1)

            buff.add(s_t, a_t[0], r_t, s_t1, done)  # Add to the replay buffer

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            #print('New State:', new_states)
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict(
                [new_states, actor.target_model.predict(new_states)])
            #print('target_q_values:', target_q_values)
            #print('batch:', len(batch))
            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train_indicator):
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            if (RC.GB_TRACE):
                print("Episode", episode, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)

            if np.mod(j, 3) == 0:
                if (train_indicator):
                    if (RC.GB_TRACE):
                        print("Now we save model")
                    actor.model.save_weights("actormodel.h5", overwrite=True)
                    with open("actormodel.json", "w") as outfile:
                        json.dump(actor.model.to_json(), outfile)
                    critic.model.save_weights("criticmodel.h5", overwrite=True)
                    with open("criticmodel.json", "w") as outfile:
                        json.dump(critic.model.to_json(), outfile)

            if np.mod(j, 10) == 0:
                print("TOTAL REWARD @ " + str(episode) + "-th Episode : Reward " + str(total_reward))
                print("Total Step: " + str(step))
                print("")

            step += 1
            if done:
                break
        # End of the step loop

    print("Finish.")
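# Hedged sketch (the ActorNetwork/CriticNetwork classes are not shown in this file):
# the soft target-network update that actor.target_train() and critic.target_train()
# are assumed to perform in the usual Keras DDPG pattern,
#   theta_target <- TAU * theta + (1 - TAU) * theta_target
def soft_target_update(model, target_model, tau):
    weights = model.get_weights()
    target_weights = target_model.get_weights()
    # Blend the online weights into the target weights, layer by layer
    new_target_weights = [tau * w + (1.0 - tau) * tw
                          for w, tw in zip(weights, target_weights)]
    target_model.set_weights(new_target_weights)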
#from keras.engine.training import collect_trainable_weights
import json
import time
import numpy as np
import tensorflow as tf

# DDPG
from ddpg.ReplayBuffer import ReplayBuffer
from ddpg.ActorNetworkObjSupport import ActorNetwork
from ddpg.CriticNetwork import CriticNetwork
from ddpg.OU import OU
import timeit

# MATPLOT
import matplotlib as mpl
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt

OU = OU()  # Ornstein-Uhlenbeck process

try:
    import vrep
except:
    print('--------------------------------------------------------------')
    print('"vrep.py" could not be imported. This means very probably that')
    print('either "vrep.py" or the remoteApi library could not be found.')
    print('Make sure both are in the same folder as this file,')
    print('or appropriately adjust the file "vrep.py"')
    print('--------------------------------------------------------------')
    print('')

CSERVER_PORT = 19999

##############################################################################################################################################################
##############################################################################################################################################################
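# Assumption (interface sketch only, not the shipped ddpg.OU code): OU.function(x, mu,
# theta, sigma) is expected to return one Ornstein-Uhlenbeck step around the current
# action value x, i.e. a mean-reverting drift toward mu plus Gaussian exploration noise.
def ou_noise_step(x, mu, theta, sigma):
    # theta pulls x back toward mu; sigma scales the random exploration term
    return theta * (mu - x) + sigma * np.random.randn(1)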
def startTraining(train_indicator=0):  # 1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001     # Target network hyperparameter
    LRA = 0.0001    # Learning rate for Actor
    LRC = 0.001     # Learning rate for Critic

    action_dim = 7   # Joint movement
    # Each contacting scenario consists of hand and plate state (sufficient to be used as the Environment Observation)
    state_dim = 20   # Joint (pos & vel) + ball position + ball velocity

    np.random.seed(1337)
    vision = False

    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer

    env = KukaCatchObjsGymEnv(renders=True)

    ## ---------------------------------------------------------------
    # Now load the weights
    print("Now we load the weights")
    try:
        actor.model.load_weights("actormodel.h5")
        critic.model.load_weights("criticmodel.h5")
        actor.target_model.load_weights("actormodel.h5")
        critic.target_model.load_weights("criticmodel.h5")
        print("Weights loaded successfully")
        print("######################################################")
        print("######################################################")
        print("######################################################")
    except:
        print("Cannot find the weights")

    print("Falling Object Catching Experiment Start.")
    for episode in range(episode_count):
        if (gb_trace):
            print("Episode : " + str(episode) + " Replay Buffer " + str(buff.count()))

        ob = env.reset()
        #s_t = np.reshape(ob, (-1, action_dim))
        s_t = np.hstack((
            ob[0], ob[1], ob[2], ob[3], ob[4], ob[5], ob[6],        # Joint i (pos & vel)
            ob[7], ob[8], ob[9], ob[10], ob[11], ob[12], ob[13],
            ob[14], ob[15], ob[16],                                 # Ball pos X, Y, Z
            ob[17], ob[18], ob[19]                                  # Ball linear vel X, Y, Z
        ))
        #print('OB', s_t)
        total_reward = 0.

        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            #print("ST RESHAPE", s_t.reshape(1, s_t.shape[0]), s_t.shape[0])
            a_t_original = actor.model.predict(np.reshape(s_t, (1, s_t.shape[0])))
            #print("a_t", a_t)
            #print("noise_t", noise_t)
            #print("a_t_original", a_t_original)
            for i in range(action_dim):
                noise_t[0][i] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][i], 0.0, 0.60, 0.30)

            # The following code does the stochastic brake
            #if random.random() <= 0.1:
            #    print("********Now we apply the brake***********")
            #    noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], 0.2, 1.00, 0.10)

            for i in range(action_dim):
                a_t[0][i] = a_t_original[0][i] + noise_t[0][i]

            ob, r_t, done, info = env.step(a_t[0])
            s_t1 = np.hstack((
                ob[0], ob[1], ob[2], ob[3], ob[4], ob[5], ob[6],    # Joint i (pos & vel)
                ob[7], ob[8], ob[9], ob[10], ob[11], ob[12], ob[13],
                ob[14], ob[15], ob[16],                             # Ball pos X, Y, Z
                ob[17], ob[18], ob[19]                              # Ball linear vel X, Y, Z
            ))
            #print('OB reshape', s_t1)

            buff.add(s_t, a_t[0], r_t, s_t1, done)  # Add to the replay buffer

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            #print('New State:', new_states)
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict(
                [new_states, actor.target_model.predict(new_states)])
            #print('target_q_values:', target_q_values)
            #print('batch:', len(batch))
            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train_indicator):
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            #if (gb_trace):
            #    print("Episode", episode, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)

            step += 1
            if done:
                break

        if np.mod(episode, 3) == 0:
            if (train_indicator):
                if (gb_trace):
                    print("Now we save model")
                actor.model.save_weights("actormodel.h5", overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)
                critic.model.save_weights("criticmodel.h5", overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)

        if (gb_trace):
            print("TOTAL REWARD @ " + str(episode) + "-th Episode : Reward " + str(total_reward))
            print("Total Step: " + str(step))
            print("")

    print("Finish.")
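# Optional refactor sketch (same math as the per-sample loop above): the Bellman
# targets y = r + GAMMA * Q'(s', mu'(s')) for non-terminal transitions, and y = r
# for terminal ones, can be computed in a single vectorized numpy expression.
import numpy as np

def bellman_targets(rewards, target_q_values, dones, gamma):
    # rewards: shape (N,); target_q_values: shape (N, 1) from the target critic;
    # dones: shape (N,) booleans marking terminal transitions
    q_next = np.asarray(target_q_values).reshape(-1)
    mask = 1.0 - np.asarray(dones, dtype=np.float32)
    return np.asarray(rewards) + gamma * q_next * mask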
def train(sess, image_agent, continue_train=False):
    BUFFER_SIZE = 100000
    BATCH_SIZE = 128
    GAMMA = 0.9
    TAU = 0.001
    INIT_LRA = 0.000001
    INIT_LRC = 0.0001
    EPISODE_MAX_STEP = 5000
    # DECAY_RATE = 0.5
    # DECAY_STEP = 3000000
    #TOTAL_EPISODE = 30000
    TOTAL_EPISODE = 20000
    EXPLORE = 500000
    CURRENT_STEP = 0

    actor = ActorNetwork(sess, BATCH_SIZE, TAU, INIT_LRA)
    critic = CriticNetwork(sess, BATCH_SIZE, TAU, INIT_LRC)
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    sess.graph.finalize()
    ou = OU()

    # if continue_train:
    #     #TODO: reload network and params
    #     pass

    buffer_follow = ReplayBuffer(BUFFER_SIZE)
    buffer_straight = ReplayBuffer(BUFFER_SIZE)
    buffer_left = ReplayBuffer(BUFFER_SIZE)
    buffer_right = ReplayBuffer(BUFFER_SIZE)
    buffer_dict = {0: buffer_follow, 1: buffer_left, 2: buffer_right, 3: buffer_straight}

    epsilon = 1.0
    env = Env("./log", "./data", image_agent)
    #env.reset()

    for i in range(TOTAL_EPISODE):
        try:
            ob = env.reset()
        except Exception:
            continue
        total_reward = 0
        episode_step = 0
        s_t = ob

        for j in range(EPISODE_MAX_STEP):
            if s_t is None or len(s_t) < 514:
                continue
            epsilon -= 1.0 / EXPLORE
            image_input = s_t[0:-2]
            speed_input = s_t[-2:-1]
            # GO_STRAIGHT = 5.0, TURN_RIGHT = 4.0, TURN_LEFT = 3.0, LANE_FOLLOW = 2.0
            direction = s_t[-1:]
            branch_st = int(direction - 2)
            if branch_st == -2:  # REACH_GOAL = 0
                break

            a_t = np.zeros([1, 3])  # steer, throttle, brake
            noise_t = np.zeros([1, 3])
            a_t_predict = actor.pridect_action(image_input, speed_input, branch_st)
            noise_t[0][0] = max(epsilon, 0) * ou.function(a_t_predict[0][0], 0, 0.6, 0.3)
            noise_t[0][1] = max(epsilon, 0) * ou.function(a_t_predict[0][1], 0.5, 1, 0.1)
            noise_t[0][2] = max(epsilon, 0) * ou.function(a_t_predict[0][2], -0.1, 1, 0.05)
            a_t = a_t_predict + noise_t
            # if (CURRENT_STEP < 10000) and j < 50:
            #     a_t[0][2] = 0
            #     a_t[0][1] = max(0.6, a_t[0][1])

            try:
                ob, r_t, done = env.step(a_t[0])
                s_t1 = ob
                if s_t1 is None or len(s_t1) < 514:
                    continue
                buffer_dict[branch_st].add(s_t, a_t[0], r_t, s_t1, done)
            except Exception:
                break

            # Train Actor and Critic on a randomly chosen branch
            branch_to_train = random.choice([0, 1, 2, 3])
            if buffer_dict[branch_to_train].count() > 128:
                train_ddpg(actor, critic, buffer_dict, BATCH_SIZE, branch_to_train)

            total_reward += r_t
            s_t = s_t1
            CURRENT_STEP += 1
            episode_step += 1
            if (done):
                break

        print("buffer length:{},{},{},{}, total reward:{}, current_step:{}, total_step:{}".format(
            buffer_dict[0].count(), buffer_dict[1].count(),
            buffer_dict[2].count(), buffer_dict[3].count(),
            total_reward, episode_step, CURRENT_STEP))

        if np.mod(i, 2000) == 0:
            saver.save(sess, './model/ddpg_model')
            with open("./episode.txt", "w") as log:
                log.write("{},{}\n".format(i, epsilon))
            with open("./buffer.pkl", "wb") as buffer_log:
                pickle.dump(buffer_dict, buffer_log)
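# Illustration (hypothetical helper, not part of the training code above): the
# high-level command in s_t[-1] is mapped to a replay-buffer branch index via
# branch = int(direction - 2), i.e. LANE_FOLLOW(2.0) -> 0, TURN_LEFT(3.0) -> 1,
# TURN_RIGHT(4.0) -> 2, GO_STRAIGHT(5.0) -> 3, while REACH_GOAL(0.0) -> -2 ends
# the episode.
def command_to_branch(direction):
    branch = int(direction - 2)
    return branch  # -2 signals REACH_GOAL / episode end

assert command_to_branch(2.0) == 0   # LANE_FOLLOW -> buffer_follow
assert command_to_branch(3.0) == 1   # TURN_LEFT   -> buffer_left
assert command_to_branch(4.0) == 2   # TURN_RIGHT  -> buffer_right
assert command_to_branch(5.0) == 3   # GO_STRAIGHT -> buffer_straight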
def playGame(actor, critic, train=False):
    GAMMA = 0.99
    vision = False
    episode_count = 2000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer
    ou = OU()  # Ornstein-Uhlenbeck process

    # Generate a TORCS environment
    env = TorcsEnv(vision=vision, throttle=False, gear_change=False)

    print("TORCS Experiment Start.")
    for n_episode in range(episode_count):
        print("Episode : " + str(n_episode) + " Replay Buffer " + str(buff.count()))
        ob = env.reset()
        s_t = np.hstack((ob.angle, ob.trackPos))
        # ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
        total_reward = 0.

        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train * max(epsilon, 0) * ou.function(a_t_original[0][0], 0.0, 0.60, 0.30)
            #noise_t[0][1] = train * max(epsilon, 0) * ou.function(a_t_original[0][1], 0.5, 1.00, 0.10)
            #noise_t[0][2] = train * max(epsilon, 0) * ou.function(a_t_original[0][2], -0.1, 1.00, 0.05)

            # The following code does the stochastic brake
            #if random.random() <= 0.1:
            #    print("********Now we apply the brake***********")
            #    noise_t[0][2] = train * max(epsilon, 0) * ou.function(a_t_original[0][2], 0.2, 1.00, 0.10)

            for i in range(action_dim):
                a_t[0][i] = a_t_original[0][i] + noise_t[0][i]

            ob, r_t, done, info = env.step(a_t[0])
            s_t1 = np.hstack((ob.angle, ob.trackPos))
            #, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))

            buff.add(s_t, a_t[0], r_t, s_t1, done)  # Add to the replay buffer

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict(
                [new_states, actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train):
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            if np.mod(n_episode, 10) == 0:
                print("Episode", n_episode, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)

            step += 1
            if done:
                break

        if np.mod(n_episode, 3) == 0:
            if (train):
                print("Now we save model")
                actor.model.save_weights("data/actormodel.h5", overwrite=True)
                with open("data/actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)
                critic.model.save_weights("data/criticmodel.h5", overwrite=True)
                with open("data/criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(n_episode) + "-th Episode : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
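# Usage sketch (assumptions: the module-level globals this function reads --
# BUFFER_SIZE, BATCH_SIZE, EXPLORE, TAU, LRA, LRC, state_dim, action_dim -- are
# defined in this file, and ActorNetwork/CriticNetwork take the same constructor
# arguments as in startTraining above).
def run_torcs_training():
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    playGame(actor, critic, train=True)  # train=True enables learning and model saving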