Code example #1
def train(u_init, play_type="AC"):
    """
    Function to train the actor and the critic target networks
    It has to be after the construction of the replay_memory
    """
    global graph, colors

    loss, losses, trainnum = 0, [], 0
    save_weights()

    file = open("rewards.txt", "w")
    file.close()
    file = open("delta_max.txt", "w")
    file.close()

    s_a_file = open("state_action.txt", "w")
    s_a_file.close()

    for ep in range(cst_REIL["episodes"]):
        if ep % 100 == 0:
            if decay_cri_lr:
                cri.model.optimizer.lr = decays.create_decay_fn(
                    "linear",
                    curr_step=ep % int(cst_REIL["episodes"] / 500),
                    initial_value=cst_REIL["lr_critics_init"],
                    final_value=cst_REIL["lr_critics_final"],
                    max_step=int(cst_REIL["episodes"] / 500))
            else:
                cri.model.optimizer.lr = cst_REIL["lr_critics_init"]

            if decay_act_lr:
                curr_lr_actor = decays.create_decay_fn(
                    "linear",
                    curr_step=ep % int(cst_REIL["episodes"] / 500),
                    initial_value=cst_REIL["lr_actor_init"],
                    final_value=cst_REIL["lr_actor_final"],
                    max_step=int(cst_REIL["episodes"] / 500))
            else:
                curr_lr_actor = cst_REIL["lr_actor_init"]

            if ep == 0:
                # Fill the replay memory before the first update
                deque_obj.clear()
                while deque_obj.size() < cst_REIL["BATCH_SIZE"]:
                    if play_type == "AC":
                        play_with_ACpred(u_init)

        print ("episodes = %d\t lr_actor_curr = %0.8f \tlr_crits_curr = %0.8f"\
                    %(ep, curr_lr_actor, cri.model.optimizer.lr))
        time.sleep(3)

        loss = 0
        trainnum = 0

        #        curr_lr_actor = cst_REIL["lr_actor_init"]
        #        cri.model.optimizer.lr = cst_REIL["lr_critics_init"]

        rew = 0
        delta_max, along_reward = [], []

        it, totalcounter, iterok = 0, 0, False

        while it < cst_simu["max_steps"]:
            states, actions, rewards, next_states, dones = samples_memories(
                cst_REIL["BATCH_SIZE"])
            delta_number_step = []

            y_t = np.asarray([0.0] * cst_REIL["BATCH_SIZE"])
            rewards = np.concatenate(rewards)
            rwrds = np.copy(rewards)
            #            rewards = np.array([10*rr for rr in rwrds])

            #            print ("states shape : ", states.shape)
            #            print ("actions shape : ", actions.shape)
            #            print ("rewards shape: ", rewards.shape)
            #            print ("dones shape  : ", dones.shape)

            with graph.as_default():
                # Evaluate the Q function with the target critic, using the
                # target actor's actions for the next states
                target_q_values = cri.target_model.predict(
                    [next_states,
                     act.target_model.predict(next_states)])

            target_q_values = target_q_values.reshape(
                [1, target_q_values.shape[0]])[0]

            for k in range(cst_REIL["BATCH_SIZE"]):
                # Bellman target: bootstrap only for non-terminal transitions
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + cst_REIL["gamma"] * target_q_values[k]

            with graph.as_default():
                # Critic update on the sampled batch
                logs = cri.model.train_on_batch([states, actions], y_t)

                a_for_grad = act.model.predict(states)
                grad = cri.gradients(states, a_for_grad)

                act.train(states, grad, learning_rate=curr_lr_actor)

                act.target_train()
                cri.target_train()

                s_a_file = open("state_action.txt", "w")
                s_a_file.write("Episodes : %d\t Iteration : %d\n" % (ep, it))
                for s, a, r in zip(states, actions, rewards):
                    s_a_file.write("States \n{} \n".format(s))
                    s_a_file.write("Actions \n{} \n".format(a))
                    s_a_file.write("rewards \n{} \n".format(r))

                s_a_file.close()

                # In this section we decide whether we continue or not.
                # We use these actor and critic target networks for the next steps_before_change steps.

                save_weights()
                print ("totalcounter = %d, \t lr_actor = %.6f\t lr_crits = %.6f"\
                                        %(it, curr_lr_actor, cri.model.optimizer.lr))

                print(logs / cst_simu["max_steps"])
                load_weights()

                new_actions = act.target_model.predict(states)
                new_next = []
                new_reward = []
#
#                for s,a in zip(states, actions) :
#                    ns = ACactions.action_with_delta_Un(s,a)
#                    nr = reward(s, ns)
#                    new_next.append(ns)
#                    new_reward.append(nr)

#                    if nr > 1 and nr < 1000 :
#                        done = True
#                        rew = nr
#
#                    elif nr > 1000 :
#                        done = True # Game over
#                        rew = -nr
#
#                    else :
#                        done = False # Keep going if the reward is acceptable
#                        rew = nr
#
#                    deque_obj.append((s, a, nr, ns, done))
#
##                    print deque_obj.size()

            loss += abs(logs) / cst_simu["max_steps"]
            it += 1
            trainnum += 1
        print("Episode = %d :" % ep)
        print("total loss = %.4f" % loss)

        losses.append(loss)

        plt.pause(0.01)
        # plt.figure() creates the figure if it does not exist yet
        plt.figure("Evolution de Loss sur un episodes vs iteration STEP = 100")
        plt.semilogy(ep, loss, marker='o', ms=6, linestyle="none", c='navy')
        plt.pause(0.5)

    return losses
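The heart of the batch update above is the construction of the TD targets y_t = r_t + gamma * Q_target(s_{t+1}, mu_target(s_{t+1})), with the bootstrap term dropped for terminal transitions. The following is a minimal, self-contained numpy sketch of that step only; it does not use the repository's cri/act wrappers, and the name td_targets is illustrative.

import numpy as np

def td_targets(rewards, dones, target_q_values, gamma=0.99):
    """Bellman targets for a DDPG-style critic update.

    All three arrays have the same length; the bootstrap term is
    suppressed for terminal transitions (dones[k] == True).
    """
    rewards = np.asarray(rewards, dtype=float)
    dones = np.asarray(dones, dtype=bool)
    target_q_values = np.asarray(target_q_values, dtype=float)
    return rewards + gamma * target_q_values * (~dones)

# Example: the third transition is terminal, so its target is just the reward.
y_t = td_targets([1.0, 0.5, 2.0], [False, False, True], [10.0, 8.0, 6.0])
# y_t == [10.9, 8.42, 2.0]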
Code example #2
def play_with_ACpred(u_init, noisy=True):
    """
    Function that plays a certain number of iterations of the game (until it finishes).
    This function can also be used to construct the rpm.
    The loop on the episodes is outside of the function. 
    
    Arguments :
    -----------
    u_init : To set the initialisation
    """
    episode_memory = []

    epsilon = 0
    total_rew = 0
    a_t = np.zeros([1, X.size])

    # For exploring the action space
    #    noise = noiselevel
    #    noiselevel = noise * 0.999
    #    noise_t = np.zeros([1, action_size])

    # Draw a random integer q between 1 and max_steps.
    # Solve with Roe up to the q-th iteration and store the result as s_t.
    # Then apply the whole procedure used so far.
    # Repeat this process until the replay buffer is full.

    global graph
    while deque_obj.size() < cst_REIL["replay_memory_size"]:

        int_line = range(0, cst_simu["max_steps"])
        curr_step = np.random.choice(int_line)

        # Advance the Burgers solver from u_init up to the random step;
        # the resulting state becomes s_t
        temp_state = u_init

        for j in range(0, curr_step + 1):
            temp_state1 = ACactions.action_with_burger(temp_state,
                                                       cst_simu["r"], f,
                                                       fprime)

            if j != curr_step:
                temp_state = temp_state1

        s_t = temp_state

        with graph.as_default():
            a_t_original = act.model.predict(np.array([s_t]))

            OU_noise = np.zeros_like(a_t_original)

            if noisy:
                epsilon = decays.create_decay_fn(
                    "linear",
                    curr_step=j,
                    initial_value=cst_REIL['EpsForNoise_init'],
                    final_value=cst_REIL['EpsForNoise_fina'],
                    max_step=cst_simu["max_steps"])

                args = {
                    "rp_type": "ornstein-uhlenbeck",
                    "n_action": 1,
                    "rp_theta": 0.1,
                    "rp_mu": 0.,
                    "rp_sigma": 0.2,
                    "rp_sigma_min": 0.05
                }

                coeffOU_noise = noise.create_random_process(args).sample()
                OU_noise = coeffOU_noise * (np.random.rand(X.size) - 0.5)

            a_t = a_t_original + OU_noise
            a_t = a_t.ravel()

            s_t1 = ACactions.action_with_delta_Un(s_t, a_t)

            r_t = reward(s_t1, s_t)

            #        print ("state :\n{}".format(s_t))
            #        print ("action :\n{}".format(a_t))
            #        print ("next state :\n{}".format(s_t1))
            #
            #        time.sleep(5)

            if 1 < r_t < 1000:
                done = True  # Game over
                rew = r_t

            elif r_t > 1000:
                done = True  # Game over
                rew = r_t * 10  # Large penalty (the hybrid_ddpg version uses -r_t here)

            else:
                done = False  # Keep going if the reward is acceptable
                rew = r_t

            print("reward :\n{}".format(rew))
            deque_obj.append((s_t, a_t, rew, s_t1, done))
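Both play functions rely on noise.create_random_process with rp_type "ornstein-uhlenbeck" for exploration; that helper belongs to the project and its source is not shown in these excerpts. For reference, a minimal self-contained Ornstein-Uhlenbeck sampler consistent with the theta/mu/sigma parameters passed above might look like the sketch below (the class name and the dt argument are assumptions, not the project's API).

import numpy as np

class OrnsteinUhlenbeckNoise:
    """Minimal OU process: x <- x + theta*(mu - x)*dt + sigma*sqrt(dt)*N(0, 1)."""

    def __init__(self, size, theta=0.1, mu=0.0, sigma=0.2, dt=1.0):
        self.theta, self.mu, self.sigma, self.dt = theta, mu, sigma, dt
        self.x = np.full(size, mu, dtype=float)

    def sample(self):
        self.x = (self.x
                  + self.theta * (self.mu - self.x) * self.dt
                  + self.sigma * np.sqrt(self.dt) * np.random.randn(*self.x.shape))
        return self.x

# Temporally correlated exploration noise added to the actor's action:
ou = OrnsteinUhlenbeckNoise(size=5, theta=0.1, mu=0.0, sigma=0.2)
noisy_action = np.zeros(5) + ou.sample()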
Code example #3
File: Burger_act_crits_backup.py    Project: nsaura/ML
def train():
    """
    Function to train the actor and the critic target networks
    It has to be after the construction of the replay_memory
    """
    loss = 0
    losses = []
    trainnum = 0
    global graph
    for ep in range(episodes):
        loss = 0
        trainnum = 0

        critics.model.optimizer.lr = decays.create_decay_fn(
            "linear",
            curr_step=ep,
            initial_value=lr_critics_init,
            final_value=lr_critics_final,
            max_step=episodes)
        curr_lr_actor = decays.create_decay_fn("linear",
                                               curr_step=ep,
                                               initial_value=lr_actor_init,
                                               final_value=lr_actor_final,
                                               max_step=episodes)
        if ep % 5 == 0:
            print("episodes = %d\t lr_actor_curr = %0.8f \tlr_crits_curr = %0.8f"
                  % (ep, curr_lr_actor, critics.model.optimizer.lr))

        for it in range(max_steps):
            states, actions, rewards, next_states, goons = samples_memories(
                BATCH_SIZE)
            #       Just to test
            #        print ("states shape : ", states.shape)
            #        print ("actions shape : ", actions.shape)
            #        print ("rewards shape: ", rewards.shape)
            #        print ("goons shape  : ", goons.shape)
            y_t = np.asarray([0.0] * BATCH_SIZE)
            rewards = np.concatenate(rewards)

            with graph.as_default():
                # Q function evaluation on the target graphs
                target_q_values = critics.target_model.predict(
                    [next_states,
                     actor.target_model.predict(next_states)])
            target_q_values = target_q_values.reshape(
                [1, target_q_values.shape[0]])[0]

            for k in range(BATCH_SIZE):
                # goons[k] is the "continue" flag: it masks out the bootstrap
                # term for terminal transitions
                y_t[k] = rewards[k] + goons[k] * gamma * target_q_values[k]

            with graph.as_default():
                # Critic update on the sampled batch; logs is the (Q - y)**2 loss
                logs = critics.model.train_on_batch([states, actions], y_t)

                a_for_grad = actor.model.predict(states)
                grad = critics.gradients(states, a_for_grad)

                actor.train(states, grad, learning_rate=curr_lr_actor)

                actor.target_train()
                critics.target_train()

    #            plt.figure("Comparaison")
    #            plt.plot(X, vvals, label='True', c='k')
    #            plt.plot(X, actor.target_model.predict(states[0].reshape(1,-1)).ravel(), label="Process", c='yellow', marker='o', fillstyle="none", linestyle='none')
    #            plt.show()
    ###            plt.legend()
            loss += logs

            trainnum += 1
        print("Episode = %d :" % ep)
        print("total loss = %.4f" % loss)

        losses.append(loss)

        plt.figure("Evolution de Loss sur un episodes vs iteration")
        plt.semilogy(ep, loss, marker='o', ms=6, linestyle="none", c='r')
        plt.pause(0.5)

    return losses
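actor.target_train() and critics.target_train() are project methods whose code is not included in these excerpts; in standard DDPG they perform a Polyak (soft) update of the target weights. A minimal sketch of that rule, assuming Keras-style get_weights()/set_weights() lists and an illustrative tau:

import numpy as np

def soft_target_update(online_weights, target_weights, tau=0.001):
    """Polyak averaging: target <- tau * online + (1 - tau) * target.

    Both arguments are lists of numpy arrays, as returned by Keras'
    model.get_weights(); the blended list can be fed back to
    target_model.set_weights().
    """
    return [tau * w + (1.0 - tau) * tw
            for w, tw in zip(online_weights, target_weights)]

# Example with dummy weight tensors:
online = [np.ones((2, 2)), np.zeros(2)]
target = [np.zeros((2, 2)), np.ones(2)]
target = soft_target_update(online, target, tau=0.1)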
Code example #4
File: Burger_act_crits_backup.py    Project: nsaura/ML
def play(u_init):
    """
    Function that plays a certain number of iterations of the game (until it finishes).
    This function can also be used to construct the rpm.
    The loop on the episodes is outside of the function. 
    
    Arguments :
    -----------
    u_init : To set the initialisation
    """
    episode_memory = []
    s_t = u_init

    total_rew = 0
    a_t = np.zeros([1, s_t.size])

    # For exploring the action space
    #    noise = noiselevel
    #    noiselevel = noise * 0.999
    #    noise_t = np.zeros([1, action_size])

    global graph
    for j in range(max_steps):
        with graph.as_default():
            a_t_original = actor.model.predict(np.array([s_t]))

        epsilon = decays.create_decay_fn("linear",
                                         curr_step=j,
                                         initial_value=EpsForNoise_init,
                                         final_value=EpsForNoise_fina,
                                         max_step=max_steps)

        if j % 150 == 0:
            print("steps = %d\t eps = %0.8f" % (j, epsilon))

        args = {
            "rp_type": "ornstein-uhlenbeck",
            "n_action": 1,
            "rp_theta": 0.1,
            "rp_mu": 0.,
            "rp_sigma": 0.2,
            "rp_sigma_min": 0.05
        }

        a_t = a_t_original + epsilon * noise.create_random_process(
            args).sample()
        a_t = a_t.ravel()

        # Clip the action to [-1, 1]
        a_t = np.clip(a_t, -1., 1.)
        s_t1 = action_with_delta_Un(s_t, a_t)

        #        return s_t, a_t, st_1

        r_t = reward(s_t1, s_t)

        #        print ("state :\n{}".format(s_t))
        #        print ("action :\n{}".format(a_t))
        #        print ("reward :\n{}".format(r_t))
        #        print ("next state :\n{}".format(s_t1))

        if r_t < 0.001:
            goon = False
        else:
            goon = True

        if len(replay_memory) < replay_memory_size:
            replay_memory.append((s_t, a_t, r_t, s_t1, goon))

        else:
            # Once the memory is full, randomly drop one element from either
            # end; the current transition is not stored, and the next one
            # will be appended on the following step
            if abs(np.random.randn()) > 0.5:
                replay_memory.popleft()  # Pop the leftmost element
            else:
                replay_memory.pop()  # Pop the rightmost element

        s_t = s_t1
        #        print ("next_state :\n{}".format(s_t1))

        total_rew += r_t

        if len(replay_memory) % 150 == 0:
            print("Memory size = %d" % len(replay_memory))
Code example #5
File: hybrid_ddpg.py    Project: nsaura/ML
def play_with_ACpred(u_init, noisy=True):
    """
    Function that plays a certain number of iterations of the game (until it finishes).
    This function can also be used to construct the rpm.
    The loop on the episodes is outside of the function. 
    
    Arguments :
    -----------
    u_init : To set the initialisation
    """
    episode_memory = []
    s_t = u_init

    epsilon = 0
    total_rew = 0
    a_t = np.zeros([1, s_t.size])

    # For exploring the action space
    #    noise = noiselevel
    #    noiselevel = noise * 0.999
    #    noise_t = np.zeros([1, action_size])

    for j in range(cst_simu["max_steps"]):
        global graph
        while deque_obj.size() < cst_REIL["replay_memory_size"]:
            with graph.as_default():
                a_t_original = act.model.predict(np.array([s_t]))

            OU_noise = np.zeros_like(a_t_original)

            if noisy:
                epsilon = decays.create_decay_fn(
                    "linear",
                    curr_step=j,
                    initial_value=cst_REIL['EpsForNoise_init'],
                    final_value=cst_REIL['EpsForNoise_fina'],
                    max_step=cst_simu["max_steps"])

                args = {
                    "rp_type": "ornstein-uhlenbeck",
                    "n_action": 1,
                    "rp_theta": 0.1,
                    "rp_mu": 0.,
                    "rp_sigma": 0.2,
                    "rp_sigma_min": 0.05
                }

                OU_noise = noise.create_random_process(args).sample()

            a_t = a_t_original + epsilon * OU_noise
            a_t = a_t.ravel()

            s_t1 = ACactions.action_with_delta_Un(s_t, a_t)

            r_t = reward(s_t1, s_t)

            #        print ("state :\n{}".format(s_t))
            #        print ("action :\n{}".format(a_t))
            #        print ("next state :\n{}".format(s_t1))
            #
            #        time.sleep(5)

            if 1 < r_t < 1000:
                done = True  # Game over
                rew = r_t

            elif r_t > 1000:
                done = True  # Game over
                rew = -r_t  # Large penalty

            else:
                done = False  # Keep going if the reward is acceptable
                rew = r_t

            print("reward :\n{}".format(rew))
            deque_obj.append((s_t, a_t, rew, s_t1, done))

            s_t = s_t + noise.create_random_process(args).sample()
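decays.create_decay_fn("linear", ...) is another project helper whose source is not part of these excerpts. Judging from how it is called (curr_step, initial_value, final_value, max_step), a plain linear interpolation such as the sketch below would reproduce the intended schedule; the name linear_decay is an assumption.

def linear_decay(curr_step, initial_value, final_value, max_step):
    """Linearly interpolate from initial_value (step 0) to final_value (step max_step)."""
    frac = min(max(float(curr_step) / float(max_step), 0.0), 1.0)
    return initial_value + frac * (final_value - initial_value)

# Example: epsilon for the exploration noise, decayed over an episode.
eps = linear_decay(curr_step=50, initial_value=1.0,
                   final_value=0.05, max_step=200)  # -> 0.7625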
Code example #6
File: Burger_act_crits.py    Project: nsaura/ML
def play_with_burger(u_init):
    """
    Function that plays a certain number of iterations of the game (until it finishes).
    This function can also be used to construct the rpm.
    The loop on the episodes is outside of the function. 
    
    we use timestep_roe to provide next steps
    
    Arguments :
    -----------
    u_init : To set the initialisation
    """
    episode_memory = []
    s_t = u_init

    total_rew = 0
    a_t = np.zeros([1, s_t.size])

    for j in range(max_steps):
        s_t1 = action_with_burger(s_t)
        # The nominal action is the pointwise increment produced by the Burgers step
        a_t_original = np.array([s_t1[i] - s_t[i] for i in range(len(s_t))])

        epsilon = decays.create_decay_fn("linear",
                                         curr_step=j,
                                         initial_value=EpsForNoise_init,
                                         final_value=EpsForNoise_fina,
                                         max_step=max_steps)

        #        if j % 200 == 0 :
        #            print ("steps = %d\t eps = %0.8f" %(j, epsilon))

        args = {
            "rp_type": "ornstein-uhlenbeck",
            "n_action": 1,
            "rp_theta": 0.1,
            "rp_mu": 0.,
            "rp_sigma": 0.2,
            "rp_sigma_min": 0.05
        }

        a_t = a_t_original + epsilon * noise.create_random_process(
            args).sample()
        a_t = a_t.ravel()

        #        a_tt = np.copy(a_t)
        #        for a in range(len(a_t)) :
        #            if a_tt[a] > 1. :
        #                a_tt[a] = 1.
        #            elif a_tt[a] < -1. :
        #                a_tt[a] = -1.
        #            else :
        #                pass
        #        a_t = np.array([a for a in a_tt])
        s_t1 = action_with_delta_Un(s_t, a_t)

        #        return s_t, a_t, st_1

        r_t = reward(s_t1, s_t)

        #        print ("state :\n{}".format(s_t))
        #        print ("action :\n{}".format(a_t))
        #        print ("reward :\n{}".format(r_t))
        #        print ("next state :\n{}".format(s_t1))

        if abs(r_t) < 10:
            goon = False
        else:
            goon = True

        if len(replay_memory) < replay_memory_size:
            replay_memory.append((s_t, a_t, r_t, s_t1, goon))

        else:
            if abs(np.random.randn()) > 0.5:
                replay_memory.popleft()  # Pop the leftmost element
            else:
                replay_memory.pop()  # Pop the rightmost element

        s_t = s_t1
        #        print ("next_state :\n{}".format(s_t1))

        total_rew += r_t