def train(u_init, play_type="AC"):
    """
    Train the actor and the critic target networks.
    It has to be called after the replay memory has been built.
    """
    global graph, colors

    loss, losses, trainnum = 0, [], 0
    save_weights()

    # Reset the log files
    file = open("rewards.txt", "w")
    file.close()
    file = open("delta_max.txt", "w")
    file.close()
    s_a_file = open("state_action.txt", "w")
    s_a_file.close()

    for ep in range(cst_REIL["episodes"]):
        # Update the learning rates every 100 episodes
        if ep % 100 == 0:
            if decay_cri_lr:
                cri.model.optimizer.lr = decays.create_decay_fn(
                    "linear",
                    curr_step=ep % int(cst_REIL["episodes"] / 500),
                    initial_value=cst_REIL["lr_critics_init"],
                    final_value=cst_REIL["lr_critics_final"],
                    max_step=int(cst_REIL["episodes"] / 500))
            else:
                cri.model.optimizer.lr = cst_REIL["lr_critics_init"]

            if decay_act_lr:
                curr_lr_actor = decays.create_decay_fn(
                    "linear",
                    curr_step=ep % int(cst_REIL["episodes"] / 500),
                    initial_value=cst_REIL["lr_actor_init"],
                    final_value=cst_REIL["lr_actor_final"],
                    max_step=int(cst_REIL["episodes"] / 500))
            else:
                curr_lr_actor = cst_REIL["lr_actor_init"]

        # Fill the replay memory before the first episode
        if ep == 0:
            deque_obj.clear()
            while deque_obj.size() < cst_REIL["BATCH_SIZE"]:
                if play_type == "AC":
                    play_with_ACpred(u_init)

        print("episodes = %d\t lr_actor_curr = %0.8f \tlr_crits_curr = %0.8f"
              % (ep, curr_lr_actor, cri.model.optimizer.lr))
        time.sleep(3)

        loss = 0
        trainnum = 0
        # curr_lr_actor = cst_REIL["lr_actor_init"]
        # cri.model.optimizer.lr = cst_REIL["lr_critics_init"]

        rew = 0
        delta_max, along_reward = [], []
        it, totalcounter, iterok = 0, 0, False

        while it < cst_simu["max_steps"]:
            states, actions, rewards, next_states, dones = samples_memories(
                cst_REIL["BATCH_SIZE"])
            delta_number_step = []

            y_t = np.asarray([0.0] * cst_REIL["BATCH_SIZE"])
            rewards = np.concatenate(rewards)
            rwrds = np.copy(rewards)
            # rewards = np.array([10*rr for rr in rwrds])

            # print ("states shape : ", states.shape)
            # print ("actions shape : ", actions.shape)
            # print ("rewards shape: ", rewards.shape)
            # print ("dones shape : ", dones.shape)

            with graph.as_default():
                # Q function evaluation on the target networks
                target_q_values = cri.target_model.predict(
                    [next_states, act.target_model.predict(next_states)])
                target_q_values = target_q_values.reshape(
                    [1, target_q_values.shape[0]])[0]

            # Bellman targets: r if the transition is terminal,
            # r + gamma * Q_target(s', pi_target(s')) otherwise
            for k in range(cst_REIL["BATCH_SIZE"]):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + cst_REIL["gamma"] * target_q_values[k]

            with graph.as_default():
                # Critic update, then actor update with the policy gradient
                logs = cri.model.train_on_batch([states, actions], y_t)
                a_for_grad = act.model.predict(states)
                grad = cri.gradients(states, a_for_grad)
                act.train(states, grad, learning_rate=curr_lr_actor)
                act.target_train()
                cri.target_train()

            # Overwritten at every iteration: only the latest batch is kept
            s_a_file = open("state_action.txt", "w")
            s_a_file.write("Episodes : %d\t Iteration : %d\n" % (ep, it))
            for s, a, r in zip(states, actions, rewards):
                s_a_file.write("States \n{} \n".format(s))
                s_a_file.write("Actions \n{} \n".format(a))
                s_a_file.write("rewards \n{} \n".format(r))
            s_a_file.close()

            # In this section we decide whether we continue or not.
            # We use those actor and critic target networks for the next
            # steps_before_change steps.
            save_weights()

            print("totalcounter = %d, \t lr_actor = %.6f\t lr_crits = %.6f"
                  % (it, curr_lr_actor, cri.model.optimizer.lr))
            print(logs / cst_simu["max_steps"])

            load_weights()
            new_actions = act.target_model.predict(states)
            new_next = []
            new_reward = []

            # for s, a in zip(states, actions):
            #     ns = ACactions.action_with_delta_Un(s, a)
            #     nr = reward(s, ns)
            #     new_next.append(ns)
            #     new_reward.append(nr)
            #     if nr > 1 and nr < 1000:
            #         done = True
            #         rew = nr
            #     elif nr > 1000:
            #         done = True   # Game over
            #         rew = -nr
            #     else:
            #         done = False  # Keep going while the reward stays small
            #         rew = nr
            #     deque_obj.append((s, a, nr, ns, done))
            #     # print(deque_obj.size())

            loss += abs(logs) / cst_simu["max_steps"]
            it += 1
            trainnum += 1

        print("Episode = %d :" % ep)
        print("total loss = %.4f" % loss)
        losses.append(loss)

        plt.pause(0.01)
        # plt.figure() creates the named figure if it does not exist yet
        plt.figure("Loss per episode vs iteration STEP = 100")
        plt.semilogy(ep, loss, marker='o', ms=6, linestyle="none", c='navy')
        plt.pause(0.5)

    return losses
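# A minimal usage sketch (illustrative only): the docstring of train() says the
# replay memory has to be built first, so the assumed call order is to fill the
# deque with play_with_ACpred() and then run train() on it.  It relies on the
# actor/critic objects, deque_obj and the cst_* dictionaries already being
# constructed elsewhere in this module; the helper name below is hypothetical.
def _fill_and_train_sketch(u_init):
    """Fill the replay memory, then train the actor/critic networks."""
    play_with_ACpred(u_init, noisy=True)   # build the replay memory first
    return train(u_init, play_type="AC")   # then fit actor and critic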
def play_with_ACpred(u_init, noisy=True):
    """
    Play a certain number of iterations of the game (until it finishes).
    This function can also be used to build the replay memory.
    The loop over episodes is outside of this function.

    Arguments :
    -----------
    u_init : To set the initialisation
    """
    global graph

    episode_memory = []
    epsilon = 0
    total_rew = 0
    a_t = np.zeros([1, X.size])

    # To explore the action space
    # noise = noiselevel
    # noiselevel = noise * 0.999
    # noise_t = np.zeros([1, action_size])

    # Draw a random integer q between 1 and max_steps, solve with Roe up to the
    # q-th iteration and store the result as s_t, then apply the usual
    # procedure.  Repeat this process until the replay buffer is full.
    while deque_obj.size() < cst_REIL["replay_memory_size"]:
        int_line = range(0, cst_simu["max_steps"])
        curr_step = np.random.choice(int_line)

        temp_state = u_init
        for j in range(0, curr_step + 1):
            temp_state1 = ACactions.action_with_burger(temp_state,
                                                       cst_simu["r"], f, fprime)
            if j != curr_step:
                temp_state = temp_state1
        s_t = temp_state

        with graph.as_default():
            a_t_original = act.model.predict(np.array([s_t]))

        OU_noise = np.zeros_like(a_t_original)

        if noisy:
            epsilon = decays.create_decay_fn(
                "linear",
                curr_step=j,
                initial_value=cst_REIL['EpsForNoise_init'],
                final_value=cst_REIL['EpsForNoise_fina'],
                max_step=cst_simu["max_steps"])
            args = {"rp_type": "ornstein-uhlenbeck",
                    "n_action": 1,
                    "rp_theta": 0.1,
                    "rp_mu": 0.,
                    "rp_sigma": 0.2,
                    "rp_sigma_min": 0.05}
            coeffOU_noise = noise.create_random_process(args).sample()
            OU_noise = coeffOU_noise * (
                np.array([np.random.rand() for rand in range(X.size)]) - 0.5)

        a_t = a_t_original + OU_noise
        a_t = a_t.ravel()

        s_t1 = ACactions.action_with_delta_Un(s_t, a_t)
        r_t = reward(s_t1, s_t)

        # print ("state :\n{}".format(s_t))
        # print ("action :\n{}".format(a_t))
        # print ("next state :\n{}".format(s_t1))
        # time.sleep(5)

        if r_t > 1 and r_t < 1000:
            done = True   # Game over
            rew = r_t
        elif r_t > 1000:
            done = True   # Game over
            rew = r_t * 10  # Large penalty
        else:
            done = False  # Keep going while the reward stays small
            rew = r_t

        print("reward :\n{}".format(rew))
        deque_obj.append((s_t, a_t, rew, s_t1, done))
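# A minimal sketch of an Ornstein-Uhlenbeck process, standing in for what
# noise.create_random_process({"rp_type": "ornstein-uhlenbeck", ...}).sample()
# is assumed to return above.  The class name and default parameters are
# illustrative only; the actual implementation lives in the project's `noise`
# module.
class _OUProcessSketch:
    """dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)."""

    def __init__(self, n_action=1, theta=0.1, mu=0., sigma=0.2, dt=1.):
        self.theta, self.mu, self.sigma, self.dt = theta, mu, sigma, dt
        self.x = np.full(n_action, mu, dtype=float)

    def sample(self):
        # One Euler step of the OU stochastic differential equation
        dx = (self.theta * (self.mu - self.x) * self.dt
              + self.sigma * np.sqrt(self.dt) * np.random.randn(self.x.size))
        self.x = self.x + dx
        return self.x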
def train():
    """
    Train the actor and the critic target networks.
    It has to be called after the replay memory has been built.
    """
    loss = 0
    losses = []
    trainnum = 0
    global graph

    for ep in range(episodes):
        loss = 0
        trainnum = 0

        critics.model.optimizer.lr = decays.create_decay_fn(
            "linear",
            curr_step=ep,
            initial_value=lr_critics_init,
            final_value=lr_critics_final,
            max_step=episodes)
        curr_lr_actor = decays.create_decay_fn(
            "linear",
            curr_step=ep,
            initial_value=lr_actor_init,
            final_value=lr_actor_final,
            max_step=episodes)

        if ep % 5 == 0:
            print("episodes = %d\t lr_actor_curr = %0.8f \tlr_crits_curr = %0.8f"
                  % (ep, curr_lr_actor, critics.model.optimizer.lr))

        for it in range(max_steps):
            states, actions, rewards, next_states, goons = samples_memories(BATCH_SIZE)

            # Just to test
            # print ("states shape : ", states.shape)
            # print ("actions shape : ", actions.shape)
            # print ("rewards shape: ", rewards.shape)
            # print ("goons shape : ", goons.shape)

            y_t = np.asarray([0.0] * BATCH_SIZE)
            rewards = np.concatenate(rewards)

            with graph.as_default():
                # Q function evaluation on the target networks
                target_q_values = critics.target_model.predict(
                    [next_states, actor.target_model.predict(next_states)])
                target_q_values = target_q_values.reshape(
                    [1, target_q_values.shape[0]])[0]

            # Bellman targets: goons[k] is 0 when the transition is terminal
            for k in range(BATCH_SIZE):
                y_t[k] = rewards[k] + goons[k] * gamma * target_q_values[k]

            with graph.as_default():
                # Critic update, then actor update with the policy gradient
                logs = critics.model.train_on_batch([states, actions], y_t)  # (Q - y)**2
                a_for_grad = actor.model.predict(states)
                grad = critics.gradients(states, a_for_grad)
                actor.train(states, grad, learning_rate=curr_lr_actor)
                actor.target_train()
                critics.target_train()

            # plt.figure("Comparison")
            # plt.plot(X, vvals, label='True', c='k')
            # plt.plot(X, actor.target_model.predict(states[0].reshape(1, -1)).ravel(),
            #          label="Process", c='yellow', marker='o', fillstyle="none",
            #          linestyle='none')
            # plt.show()
            # plt.legend()

            loss += logs
            trainnum += 1

        print("Episode = %d :" % ep)
        print("total loss = %.4f" % loss)
        losses.append(loss)

        plt.figure("Loss per episode vs iteration")
        plt.semilogy(ep, loss, marker='o', ms=6, linestyle="none", c='r')
        plt.pause(0.5)

    return losses
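# The k-loop above builds the DDPG critic targets
# y = r + goon * gamma * Q_target(s', pi_target(s')), where goon is 0 for a
# terminal transition.  A vectorized numpy equivalent is sketched below; the
# helper name is hypothetical and only restates the same computation.
def _bellman_targets_sketch(rewards, goons, target_q_values, gamma):
    """Vectorized version of the per-sample critic-target loop (illustrative)."""
    return (np.asarray(rewards, dtype=float)
            + np.asarray(goons, dtype=float) * gamma
            * np.asarray(target_q_values, dtype=float))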
def play(u_init):
    """
    Play a certain number of iterations of the game (until it finishes).
    This function can also be used to build the replay memory.
    The loop over episodes is outside of this function.

    Arguments :
    -----------
    u_init : To set the initialisation
    """
    global graph

    episode_memory = []
    s_t = u_init
    total_rew = 0
    a_t = np.zeros([1, s_t.size])

    # To explore the action space
    # noise = noiselevel
    # noiselevel = noise * 0.999
    # noise_t = np.zeros([1, action_size])

    for j in range(max_steps):
        with graph.as_default():
            a_t_original = actor.model.predict(np.array([s_t]))

        epsilon = decays.create_decay_fn("linear",
                                         curr_step=j,
                                         initial_value=EpsForNoise_init,
                                         final_value=EpsForNoise_fina,
                                         max_step=max_steps)
        if j % 150 == 0:
            print("steps = %d\t eps = %0.8f" % (j, epsilon))

        args = {"rp_type": "ornstein-uhlenbeck",
                "n_action": 1,
                "rp_theta": 0.1,
                "rp_mu": 0.,
                "rp_sigma": 0.2,
                "rp_sigma_min": 0.05}

        a_t = a_t_original + epsilon * noise.create_random_process(args).sample()
        a_t = a_t.ravel()

        # Clip the action componentwise to [-1, 1]
        a_t = np.clip(a_t, -1., 1.)

        s_t1 = action_with_delta_Un(s_t, a_t)
        # return s_t, a_t, st_1
        r_t = reward(s_t1, s_t)

        # print ("state :\n{}".format(s_t))
        # print ("action :\n{}".format(a_t))
        # print ("reward :\n{}".format(r_t))
        # print ("next state :\n{}".format(s_t1))

        goon = r_t >= 0.001

        if len(replay_memory) < replay_memory_size:
            replay_memory.append((s_t, a_t, r_t, s_t1, goon))
        else:
            # The memory is full: evict a transition at random from either end
            # (the new transition is discarded)
            if abs(np.random.randn()) > 0.5:
                replay_memory.popleft()  # Pop the leftmost element
            else:
                replay_memory.pop()      # Pop the rightmost element

        s_t = s_t1
        # print ("next_state :\n{}".format(s_t1))
        total_rew += r_t

        if len(replay_memory) % 150 == 0:
            print("Memory size = %d" % len(replay_memory))
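# The exploration schedule above relies on decays.create_decay_fn("linear", ...).
# The sketch below shows the linear interpolation it is assumed to compute, from
# initial_value at step 0 to final_value at max_step; the helper name is
# hypothetical and the real implementation lives in the `decays` module.
def _linear_decay_sketch(curr_step, initial_value, final_value, max_step):
    """Linearly interpolate between initial_value and final_value (illustrative)."""
    frac = min(max(float(curr_step) / float(max_step), 0.), 1.)
    return initial_value + frac * (final_value - initial_value)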
def play_with_ACpred(u_init, noisy=True):
    """
    Play a certain number of iterations of the game (until it finishes).
    This function can also be used to build the replay memory.
    The loop over episodes is outside of this function.

    Arguments :
    -----------
    u_init : To set the initialisation
    """
    global graph

    episode_memory = []
    s_t = u_init
    epsilon = 0
    total_rew = 0
    a_t = np.zeros([1, s_t.size])

    # To explore the action space
    # noise = noiselevel
    # noiselevel = noise * 0.999
    # noise_t = np.zeros([1, action_size])

    for j in range(cst_simu["max_steps"]):
        while deque_obj.size() < cst_REIL["replay_memory_size"]:
            with graph.as_default():
                a_t_original = act.model.predict(np.array([s_t]))

            OU_noise = np.zeros_like(a_t_original)

            # Random-process parameters (also used to perturb the state below)
            args = {"rp_type": "ornstein-uhlenbeck",
                    "n_action": 1,
                    "rp_theta": 0.1,
                    "rp_mu": 0.,
                    "rp_sigma": 0.2,
                    "rp_sigma_min": 0.05}

            if noisy:
                epsilon = decays.create_decay_fn(
                    "linear",
                    curr_step=j,
                    initial_value=cst_REIL['EpsForNoise_init'],
                    final_value=cst_REIL['EpsForNoise_fina'],
                    max_step=cst_simu["max_steps"])
                OU_noise = noise.create_random_process(args).sample()

            a_t = a_t_original + epsilon * OU_noise
            a_t = a_t.ravel()

            s_t1 = ACactions.action_with_delta_Un(s_t, a_t)
            r_t = reward(s_t1, s_t)

            # print ("state :\n{}".format(s_t))
            # print ("action :\n{}".format(a_t))
            # print ("next state :\n{}".format(s_t1))
            # time.sleep(5)

            if r_t > 1 and r_t < 1000:
                done = True   # Game over
                rew = r_t
            elif r_t > 1000:
                done = True   # Game over
                rew = -r_t    # Large penalty
            else:
                done = False  # Keep going while the reward stays small
                rew = r_t

            print("reward :\n{}".format(rew))
            deque_obj.append((s_t, a_t, rew, s_t1, done))

            # Perturb the state before drawing the next transition
            s_t = s_t + noise.create_random_process(args).sample()
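# train() samples minibatches with samples_memories(BATCH_SIZE).  The sketch
# below shows what that helper is assumed to do: draw a random minibatch of
# (state, action, reward, next_state, done) tuples from the replay memory and
# stack each field.  The name and exact return layout are assumptions.
def _samples_memories_sketch(memory, batch_size):
    """Draw a random minibatch from the replay memory (illustrative)."""
    import random
    batch = random.sample(list(memory), batch_size)
    states, actions, rewards, next_states, dones = map(np.array, zip(*batch))
    return states, actions, rewards, next_states, dones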
def play_with_burger(u_init):
    """
    Play a certain number of iterations of the game (until it finishes).
    This function can also be used to build the replay memory.
    The loop over episodes is outside of this function.
    timestep_roe is used to provide the next steps.

    Arguments :
    -----------
    u_init : To set the initialisation
    """
    episode_memory = []
    s_t = u_init
    total_rew = 0
    a_t = np.zeros([1, s_t.size])

    for j in range(max_steps):
        s_t1 = action_with_burger(s_t)
        # The reference action is the increment produced by the Burgers step
        a_t_original = np.array([s_t1[i] - s_t[i] for i in range(len(s_t))])

        epsilon = decays.create_decay_fn("linear",
                                         curr_step=j,
                                         initial_value=EpsForNoise_init,
                                         final_value=EpsForNoise_fina,
                                         max_step=max_steps)
        # if j % 200 == 0:
        #     print ("steps = %d\t eps = %0.8f" % (j, epsilon))

        args = {"rp_type": "ornstein-uhlenbeck",
                "n_action": 1,
                "rp_theta": 0.1,
                "rp_mu": 0.,
                "rp_sigma": 0.2,
                "rp_sigma_min": 0.05}

        a_t = a_t_original + epsilon * noise.create_random_process(args).sample()
        a_t = a_t.ravel()

        # a_tt = np.copy(a_t)
        # for a in range(len(a_t)):
        #     if a_tt[a] > 1.:
        #         a_tt[a] = 1.
        #     elif a_tt[a] < -1.:
        #         a_tt[a] = -1.
        #     else:
        #         pass
        # a_t = np.array([a for a in a_tt])

        s_t1 = action_with_delta_Un(s_t, a_t)
        # return s_t, a_t, st_1
        r_t = reward(s_t1, s_t)

        # print ("state :\n{}".format(s_t))
        # print ("action :\n{}".format(a_t))
        # print ("reward :\n{}".format(r_t))
        # print ("next state :\n{}".format(s_t1))

        goon = abs(r_t) >= 10

        if len(replay_memory) < replay_memory_size:
            replay_memory.append((s_t, a_t, r_t, s_t1, goon))
        else:
            # The memory is full: evict a transition at random from either end
            # (the new transition is discarded)
            if abs(np.random.randn()) > 0.5:
                replay_memory.popleft()  # Pop the leftmost element
            else:
                replay_memory.pop()      # Pop the rightmost element

        s_t = s_t1
        # print ("next_state :\n{}".format(s_t1))
        total_rew += r_t
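# play() and play_with_burger() share the same storage policy for the replay
# memory: append while there is room, and once the deque is full evict a
# transition at random from either end while dropping the new one.  The helper
# below only restates that policy; its name is hypothetical.
def _store_transition_sketch(memory, transition, memory_size):
    """Replay-memory storage policy used by play() and play_with_burger() (illustrative)."""
    if len(memory) < memory_size:
        memory.append(transition)
    elif abs(np.random.randn()) > 0.5:
        memory.popleft()  # evict the oldest transition
    else:
        memory.pop()      # evict the newest transition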