def reward_value_constrained(v_log_counts_future, v_log_counts_old,\
                 KQ_f_new, KQ_r_new, E_Regulation_new, E_Regulation_old):

    final_reward = 0.0

    reward_s = reward_intermediate(v_log_counts_future, v_log_counts_old)

    #psi defaults to 1.0 (no effect) and is switched to a penalty scalar when a new reaction becomes regulated
    psi = 1.0

    #reward_s = e_val_old-e_val_future
    num_regulated_new = np.sum(E_Regulation_new == 1)
    num_regulated_old = np.sum(E_Regulation_old == 1)

    if (num_regulated_new != num_regulated_old):
        #then you regulated a new reaction:
        psi = penalty_reward_scalar

    if (reward_s < 0.0):
        final_reward = penalty_exclusion_reward

    if (reward_s >= 0.0):
        final_reward = psi * reward_s
        #the sign of the penalty scalar sets the preference:
        #negative (e.g. -0.01) -> take the fastest path; positive (e.g. 0.01) -> take the slowest path

    if ((np.max(v_log_counts_future - target_v_log_counts) <= 0.0)):
        #The final reward is meant to maximize the EPR value. However, there was some residual error in ds_metab
        #that must be taken into account. We therefore add the last reward_s to the EPR value.

        epr_future = max_entropy_functions.entropy_production_rate(
            KQ_f_new, KQ_r_new, E_Regulation_new)
        final_reward = (1.0) * epr_future + psi * reward_s

    return final_reward
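#reward_value_constrained (above) relies on module-level names that are not defined in this
#snippet: penalty_reward_scalar, penalty_exclusion_reward, and target_v_log_counts, as well as
#reward_intermediate (sketched further below). Purely illustrative placeholders, not the real values:
#    penalty_reward_scalar = -0.01      # example value taken from the in-code comment above
#    penalty_exclusion_reward = -1.0    # hypothetical; the actual constant is set elsewhere
#    target_v_log_counts = ...          # problem-specific target log metabolite counts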
def reward_value(v_log_counts_future, v_log_counts_old,\
                 KQ_f_new, KQ_r_new, E_Regulation_new, E_Regulation_old):
    final_reward = 0.0

    #the midpoint of the max and min deviations is used as the stabilizing shift in a log-sum-exp;
    #see https://www.xarg.org/2016/06/the-log-sum-exp-trick-in-machine-learning/

    scale_old_max = np.max(v_log_counts_old - target_v_log_counts)
    scale_old_min = np.min(v_log_counts_old - target_v_log_counts)
    scale_old = (scale_old_max + scale_old_min)/2.0
    
    e_val_old = np.exp(v_log_counts_old - target_v_log_counts - scale_old)
    e_val_old = scale_old + np.log(np.sum(e_val_old))
      
    scale_future_max = np.max(v_log_counts_future - target_v_log_counts)
    scale_future_min = np.min(v_log_counts_future - target_v_log_counts)
    scale_future = (scale_future_max + scale_future_min)/2.0

    e_val_future = np.exp(v_log_counts_future - target_v_log_counts - scale_future)
    e_val_future = scale_future  + np.log(np.sum(e_val_future))

    reward_s = e_val_old - e_val_future
    final_reward = reward_s
    if (scale_future_max <= 0.0):
        #The final reward is meant to maximize the EPR value. However, there was some residual error
        #that must be taken into account. We therefore add the last reward_s to the EPR value. 
        
        epr_future = max_entropy_functions.entropy_production_rate(KQ_f_new, KQ_r_new, E_Regulation_new)
        final_reward = 1.0 * epr_future + reward_s 
        
    return final_reward
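#Quick self-contained check of the scaling trick used in reward_value above: subtracting a
#constant shift inside the exponential and adding it back outside the log leaves the
#log-sum-exp value unchanged while keeping np.exp away from overflow. The array below is an
#illustrative stand-in for (v_log_counts - target_v_log_counts), not data from the model.
def _check_logsumexp_scaling():
    from scipy.special import logsumexp  #scipy is already a dependency (least_squares below)

    deviation = np.array([750.0, 740.0, 730.0])  #naive np.exp(deviation) overflows to inf
    scale = (np.max(deviation) + np.min(deviation)) / 2.0
    shifted = scale + np.log(np.sum(np.exp(deviation - scale)))
    return np.isclose(shifted, logsumexp(deviation))  #True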
def reward_value(v_log_counts_future, v_log_counts_old,\
                 KQ_f_new, KQ_r_new, E_Regulation_new, E_Regulation_old):

    final_reward = 0.0

    reward_s = reward_intermediate(v_log_counts_future, v_log_counts_old)

    final_reward = reward_s
    if ((np.max(v_log_counts_future - target_v_log_counts) <= 0.0)):
        #The final reward is meant to maximize the EPR value. However, there was some residual error in ds_metab
        #that must be taken into account. We therefore add the last reward_s to the EPR value.

        epr_future = max_entropy_functions.entropy_production_rate(
            KQ_f_new, KQ_r_new, E_Regulation_new)
        final_reward = (1.0) * epr_future + reward_s

    return final_reward
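#reward_intermediate is called above but not defined in this snippet. A minimal sketch,
#assuming it mirrors the inlined log-sum-exp computation in the second reward_value above
#(midpoint of the max and min deviations as the stabilizing shift); the _sketch suffix marks
#it as an inferred stand-in, not the original implementation.
def reward_intermediate_sketch(v_log_counts_future, v_log_counts_old):
    def stabilized_logsumexp(deviation):
        scale = (np.max(deviation) + np.min(deviation)) / 2.0
        return scale + np.log(np.sum(np.exp(deviation - scale)))

    e_val_old = stabilized_logsumexp(v_log_counts_old - target_v_log_counts)
    e_val_future = stabilized_logsumexp(v_log_counts_future - target_v_log_counts)
    #positive when the future log counts are closer to target_v_log_counts than the old ones
    return e_val_old - e_val_future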
def sarsa_n(nn_model, loss_fn, optimizer, scheduler, state_sample, n_back_step,
            epsilon_greedy):
    total_time_cpu = 0
    total_time_nn = 0
    #reset for each episode. policy will add
    random_steps_taken = 0
    nn_steps_taken = 0

    final_state = []
    final_KQ_f = []
    final_KQ_r = []
    reached_terminal_state = False
    average_loss = []

    final_reward = 0
    sum_reward_episode = 0
    end_of_path = 5000  #this is the maximum length a path can take
    KQ_f_matrix = np.zeros(shape=(num_rxns, end_of_path + 1))
    KQ_r_matrix = np.zeros(shape=(num_rxns, end_of_path + 1))
    states_matrix = np.zeros(shape=(num_rxns, end_of_path + 1))
    delta_S_metab_matrix = np.zeros(shape=(nvar, end_of_path + 1))
    v_log_counts_matrix = np.zeros(shape=(nvar, end_of_path + 1))

    states_matrix[:, 0] = state_sample
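    #Initialization: solve the steady-state residuals (max_entropy_functions.derivatives) for the
    #variable log counts with Levenberg-Marquardt least squares, then compute the forward/reverse
    #reaction odds (KQ_f, KQ_r) and delta_S_metab for the starting regulation state.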

    res_lsq = least_squares(max_entropy_functions.derivatives,
                            v_log_counts_static,
                            method='lm',
                            xtol=1e-15,
                            args=(f_log_counts, mu0, S_mat, R_back_mat, P_mat,
                                  delta_increment_for_small_concs,
                                  Keq_constant, states_matrix[:, 0]))

    v_log_counts_matrix[:, 0] = res_lsq.x.copy()
    log_metabolites = np.append(v_log_counts_matrix[:, 0], f_log_counts)

    rxn_flux_init = max_entropy_functions.oddsDiff(
        v_log_counts_matrix[:, 0], f_log_counts, mu0, S_mat, R_back_mat, P_mat,
        delta_increment_for_small_concs, Keq_constant, states_matrix[:, 0])
    KQ_f_matrix[:, 0] = max_entropy_functions.odds(
        log_metabolites, mu0, S_mat, R_back_mat, P_mat,
        delta_increment_for_small_concs, Keq_constant)

    Keq_inverse = np.power(Keq_constant, -1)
    KQ_r_matrix[:, 0] = max_entropy_functions.odds(
        log_metabolites, mu0, -S_mat, P_mat, R_back_mat,
        delta_increment_for_small_concs, Keq_inverse, -1)

    delta_S_metab_matrix[:, 0] = max_entropy_functions.calc_deltaS_metab(
        v_log_counts_matrix[:, 0], target_v_log_counts)

    reward_vec = np.zeros(end_of_path + 1)

    reward_vec[0] = 0.0
    rxn_flux_path = rxn_flux_init.copy()

    for t in range(0, end_of_path):
        if (t < end_of_path):
            #This represents the choice from the current policy.
            [React_Choice, reward_vec[t + 1],
             KQ_f_matrix[:, t + 1], KQ_r_matrix[:, t + 1],
             v_log_counts_matrix[:, t + 1],
             states_matrix[:, t + 1],
             delta_S_metab_matrix[:, t + 1],
             used_random_step, time_cpu, time_nn] = policy_function(
                 nn_model, states_matrix[:, t], v_log_counts_matrix[:, t],
                 epsilon_greedy)  #regulate each reaction

            total_time_cpu += time_cpu
            total_time_nn += time_nn

            if (used_random_step):
                random_steps_taken += 1
            else:
                nn_steps_taken += 1

            if (React_Choice == -1):
                print("bad reaction choice, using action = -1")
                break

            rxn_flux_path = max_entropy_functions.oddsDiff(
                v_log_counts_matrix[:, t + 1], f_log_counts, mu0, S_mat,
                R_back_mat, P_mat, delta_increment_for_small_concs,
                Keq_constant, states_matrix[:, t + 1])
            if (np.max(rxn_flux_path) < 1.0):
                print("draining flux")
                break
            epr_path = max_entropy_functions.entropy_production_rate(
                KQ_f_matrix[:, t + 1], KQ_r_matrix[:, t + 1],
                states_matrix[:, t + 1])
            sum_reward_episode += reward_vec[t + 1]

            current_state = states_matrix[:, t + 1].copy()

            #Stop the path once no delta_S_metab values remain positive (no explicit state-revisit check is done here).
            if ((delta_S_metab_matrix[:, t + 1] <= 0.0).all()):
                end_of_path = t + 1  #stops simulation at step t+1

                reached_terminal_state = True
                final_state = states_matrix[:, t + 1].copy()
                final_KQ_f = KQ_f_matrix[:, t + 1].copy()
                final_KQ_r = KQ_r_matrix[:, t + 1].copy()
                final_reward = epr_path
                print(
                    "**************************************Path Length ds<0******************************************"
                )
                print(end_of_path)
                print("Final STATE")
                print(states_matrix[:, t + 1])
                print(rxn_flux_path)
                print("original epr")
                print(epr_path)
                print("all rewards")
                print(reward_vec[0:t + 1])

        ##BEGIN LEARNING
        tau = t - n_back_step + 1

        if (tau >= 0):
            #breakpoint()
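            #n-step TD target (n = n_back_step), as in n-step SARSA:
            #  G(tau) = sum_{i=tau+1}^{min(tau+n, T)} gamma**(i-tau-1) * reward_vec[i]
            #           + gamma**n * V(S_{tau+n})   only if tau + n < T
            #where T = end_of_path and V is the value network. The loop below accumulates the
            #discounted rewards; the bootstrap term is added afterwards when the path extends past tau+n.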
            estimate_value = torch.zeros(1, device=device)

            for i in range(tau + 1, min(tau + n_back_step, end_of_path) + 1):
                estimate_value += (gamma**(i - tau - 1)) * reward_vec[i]

            if ((tau + n_back_step) < end_of_path):
                begin_nn = time.time()
                value_tau_n = state_value(
                    nn_model,
                    torch.from_numpy(
                        states_matrix[:,
                                      tau + n_back_step]).float().to(device))
                end_nn = time.time()
                total_time_nn += end_nn - begin_nn
                estimate_value += (gamma**(n_back_step)) * value_tau_n

            begin_nn = time.time()
            value_tau = state_value(
                nn_model,
                torch.from_numpy(states_matrix[:, tau]).float().to(device))
            end_nn = time.time()
            total_time_nn += end_nn - begin_nn

            if (value_tau.requires_grad == False):
                breakpoint()
            if (estimate_value.requires_grad == True):
                estimate_value.detach_()

            #WARNING
            #loss ordering should be input with requires_grad == True,
            #followed by target with requires_grad == False
            #breakpoint()
            begin_nn = time.time()
            loss = loss_fn(value_tau, estimate_value)  #MSE

            optimizer.zero_grad()
            loss.backward()
            clipping_value = 1.0
            torch.nn.utils.clip_grad_norm_(nn_model.parameters(),
                                           clipping_value)

            optimizer.step()
            end_nn = time.time()
            total_time_nn += end_nn - begin_nn
            average_loss.append(loss.item())

        if (tau >= (end_of_path - 1)):
            break

    #after episode is finished, take average loss
    average_loss_episode = np.mean(average_loss)
    print("index of max error on path")
    print(average_loss.index(max(average_loss)))
    return [sum_reward_episode, average_loss_episode, max(average_loss),
            final_reward, final_state, final_KQ_f, final_KQ_r,
            reached_terminal_state, random_steps_taken, nn_steps_taken]
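#A minimal training-loop sketch around sarsa_n. Everything other than the sarsa_n call signature
#is an assumption for illustration: the episode count, the epsilon decay, MSELoss/Adam/StepLR,
#and how the starting state samples are produced are not specified in this snippet.
def train_sketch(nn_model, state_samples, n_back_step=4, num_episodes=100):
    loss_fn = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(nn_model.parameters(), lr=1e-4)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=25, gamma=0.5)
    epsilon_greedy = 0.2  #hypothetical starting exploration rate

    for episode in range(num_episodes):
        state_sample = state_samples[episode % len(state_samples)]
        [sum_reward, avg_loss, max_loss, final_rwd, final_state, final_KQ_f, final_KQ_r,
         reached_terminal, random_steps, nn_steps] = sarsa_n(
            nn_model, loss_fn, optimizer, scheduler, state_sample, n_back_step, epsilon_greedy)
        scheduler.step()
        epsilon_greedy = max(0.01, epsilon_greedy * 0.95)  #assumed decay schedule
        print(episode, sum_reward, avg_loss, reached_terminal)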
    #Fragment (the enclosing function header is not part of this snippet): compute the quantities
    #used to decide which reaction to regulate -- fluxes, forward/reverse odds, entropy production
    #rate, delta-S terms, the Jacobian, and the A matrix.
    rxn_flux = max_entropy_functions.oddsDiff(v_log_counts, f_log_counts, mu0,
                                              S_mat, R_back_mat, P_mat,
                                              delta_increment_for_small_concs,
                                              Keq_constant, E_regulation)

    KQ_f = max_entropy_functions.odds(log_metabolites, mu0, S_mat, R_back_mat,
                                      P_mat, delta_increment_for_small_concs,
                                      Keq_constant)
    Keq_inverse = np.power(Keq_constant, -1)
    KQ_r = max_entropy_functions.odds(log_metabolites, mu0, -S_mat, P_mat,
                                      R_back_mat,
                                      delta_increment_for_small_concs,
                                      Keq_inverse, -1)

    epr = max_entropy_functions.entropy_production_rate(
        KQ_f, KQ_r, E_regulation)

    delta_S_metab = max_entropy_functions.calc_deltaS_metab(
        v_log_counts, target_v_log_counts)

    delta_S = max_entropy_functions.calc_deltaS(v_log_counts,
                                                target_v_log_counts,
                                                f_log_counts, S_mat, KQ_f)

    [RR, Jac] = max_entropy_functions.calc_Jac2(v_log_counts, f_log_counts, S_mat,
                                                delta_increment_for_small_concs,
                                                KQ_f, KQ_r, E_regulation)
    A = max_entropy_functions.calc_A(v_log_counts, f_log_counts, S_mat, Jac,
                                     E_regulation)
def sarsa_n(nn_model, loss_fn, optimizer, scheduler, state_sample, n_back_step,
            epsilon_greedy):

    #reset for each episode. policy will add
    random_steps_taken = 0
    nn_steps_taken = 0
    maximum_predicted_value = 0
    layer_weight = torch.zeros(1, device=device)

    final_state = []
    final_KQ_f = []
    final_KQ_r = []
    reached_terminal_state = False
    average_loss = []

    final_reward = 0
    sum_reward_episode = 0
    end_of_path = 1000  #this is the maximum length a path can take

    states_matrix = np.zeros(shape=(num_rxns, end_of_path + 1))
    states_matrix[:, 0] = state_sample
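    #Steady-state solve with a fallback chain: try Method1 (with bounds) first, then Method2 and
    #Method3 if the previous attempt fails. Method1/Method2/Method3 are module-level SciPy
    #least_squares method names that are not defined in this snippet.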

    res_lsq = least_squares(max_entropy_functions.derivatives,
                            v_log_counts_static,
                            method=Method1,
                            bounds=(-500, 500),
                            xtol=1e-15,
                            args=(f_log_counts, mu0, S_mat, R_back_mat, P_mat,
                                  delta_increment_for_small_concs,
                                  Keq_constant, states_matrix[:, 0]))
    if (res_lsq.success == False):
        res_lsq = least_squares(max_entropy_functions.derivatives,
                                v_log_counts_static,
                                method=Method2,
                                xtol=1e-15,
                                args=(f_log_counts, mu0, S_mat, R_back_mat,
                                      P_mat, delta_increment_for_small_concs,
                                      Keq_constant, states_matrix[:, 0]))
        if (res_lsq.success == False):
            res_lsq = least_squares(max_entropy_functions.derivatives,
                                    v_log_counts_static,
                                    method=Method3,
                                    xtol=1e-15,
                                    args=(f_log_counts, mu0, S_mat, R_back_mat,
                                          P_mat,
                                          delta_increment_for_small_concs,
                                          Keq_constant, states_matrix[:, 0]))

    v_log_counts_current = res_lsq.x.copy()
    log_metabolites = np.append(v_log_counts_current, f_log_counts)

    rxn_flux_init = max_entropy_functions.oddsDiff(
        v_log_counts_current, f_log_counts, mu0, S_mat, R_back_mat, P_mat,
        delta_increment_for_small_concs, Keq_constant, states_matrix[:, 0])
    KQ_f_current = max_entropy_functions.odds(log_metabolites, mu0, S_mat,
                                              R_back_mat, P_mat,
                                              delta_increment_for_small_concs,
                                              Keq_constant)

    Keq_inverse = np.power(Keq_constant, -1)
    KQ_r_current = max_entropy_functions.odds(log_metabolites, mu0, -S_mat,
                                              P_mat, R_back_mat,
                                              delta_increment_for_small_concs,
                                              Keq_inverse, -1)

    delta_S_metab_current = max_entropy_functions.calc_deltaS_metab(
        v_log_counts_current, target_v_log_counts)

    #[ccc,fcc] = max_entropy_functions.conc_flux_control_coeff(nvar, A_init, S_mat, rxn_flux_init, RR)

    reward_vec = np.zeros(end_of_path + 1)

    reward_vec[0] = 0.0
    rxn_flux_path = rxn_flux_init.copy()
    #A_path = A_init.copy()

    for t in range(0, end_of_path):

        if (t < end_of_path):
            #This represents the choice from the current policy.
            [React_Choice, reward_vec[t + 1],
             KQ_f_current, KQ_r_current,
             v_log_counts_current,
             states_matrix[:, t + 1],
             delta_S_metab_current,
             used_random_step] = policy_function(
                 nn_model, states_matrix[:, t], v_log_counts_current,
                 epsilon_greedy)  #regulate each reaction

            if (used_random_step):
                random_steps_taken += 1
            else:
                nn_steps_taken += 1

            if (React_Choice == -1):
                print("out of rewards, final state")
                print(states_matrix[:, t + 1])
                break

            rxn_flux_path = max_entropy_functions.oddsDiff(
                v_log_counts_current, f_log_counts, mu0, S_mat, R_back_mat,
                P_mat, delta_increment_for_small_concs, Keq_constant,
                states_matrix[:, t + 1])
            epr_path = max_entropy_functions.entropy_production_rate(
                KQ_f_current, KQ_r_current, states_matrix[:, t + 1])
            sum_reward_episode += reward_vec[t + 1]

            final_state = states_matrix[:, t + 1].copy()
            #Stop the path once no delta_S_metab values remain positive (no explicit state-revisit check is done here).
            if ((delta_S_metab_current <= 0.0).all()):
                end_of_path = t + 1  #stops simulation at step t+1

                reached_terminal_state = True
                final_state = states_matrix[:, t + 1].copy()
                final_KQ_f = KQ_f_current.copy()
                final_KQ_r = KQ_r_current.copy()
                final_reward = epr_path
                #breakpoint()
                print(
                    "**************************************Path Length ds<0******************************************"
                )
                print(end_of_path)
                print("Final STATE")
                print(states_matrix[:, t + 1])
                print(rxn_flux_path)
                print("original epr")
                print(epr_path)
                print("all rewards:")
                #print(reward_vec[0:t+1])
        tau = t - n_back_step + 1

        if (tau >= 0):

            #THIS IS THE FORWARD
            estimate_value = torch.zeros(1, device=device)

            for i in range(tau + 1, min(tau + n_back_step, end_of_path) + 1):
                estimate_value += (gamma**(i - tau - 1)) * reward_vec[i]

            if ((tau + n_back_step) < end_of_path):
                value_tau_n = state_value(
                    nn_model,
                    torch.from_numpy(
                        states_matrix[:,
                                      tau + n_back_step]).float().to(device))

                estimate_value += (gamma**(n_back_step)) * value_tau_n

            value_tau = state_value(
                nn_model,
                torch.from_numpy(states_matrix[:, tau]).float().to(device))

            if (value_tau.requires_grad == False):
                print('value tau broken')
            if (estimate_value.requires_grad == True):
                estimate_value.detach_()
            #THIS IS THE END OF FORWARD

            #WARNING
            #loss ordering should be input with requires_grad == True,
            #followed by target with requires_grad == False

            optimizer.zero_grad()

            loss = (loss_fn(value_tau, estimate_value))  #currently MSE

            loss.backward()

            clipping_value = 1.0
            #torch.nn.utils.clip_grad_value_(nn_model.parameters(), clipping_value)
            torch.nn.utils.clip_grad_norm_(nn_model.parameters(),
                                           clipping_value)

            optimizer.step()

            average_loss.append(loss.item())

        if (tau >= (end_of_path - 1)):
            break

    #after episode is finished, take average loss
    average_loss_episode = np.mean(average_loss)
    #print(average_loss)
    print("index of max error on path")
    print(average_loss.index(max(average_loss)))
    #print("All rewards")
    #print(reward_vec[0:t+1])

    return [sum_reward_episode, average_loss_episode, max(average_loss),
            final_reward, final_state, final_KQ_f, final_KQ_r,
            reached_terminal_state, random_steps_taken, nn_steps_taken]