Example #1
def incrementally_improve_V(vf, threshold, dir_points):
    is_multiple_v = False

    value_functions = [[]]
    value_functions[0].append(vf[0])
    for v in range(max_demand+1,0,-1):
        value_functions[0].append(vf[-v])
    
    #This loop iterates incrementally, using the latest value function
    #to further improve upon it.
    for i in range(num_iterations_for_vf):
        #print("iterative vf",value_functions)
        
        threshold = np.zeros((tuple_size, action_count))
        rmdp = crobust.MDP(0, discount_factor)

        pos=0
        for s in range(mdp.state_count()):
            actions = mdp.action_count(s)
            for a in range(actions):

                if len(mdp.get_toids(s,a))==0:
                    continue
                
                threshold[0,pos] = s
                threshold[1,pos] = a
                
                threshold[2,pos] = construct_rmdp(s, a, value_functions, rmdp, dir_points, is_multiple_v) 
                
                pos += 1

        #print("MDP: ",rmdp.to_json())

        sol = rmdp.rsolve_vi("robust_l1".encode(),threshold)
        vf = sol.valuefunction
        value_functions = [[]]

        #As the (S,s) policy is used as the random policy to generate samples,
        #the possible inventory levels are 0 and (max_inventory - max_demand)
        #to max_inventory, inclusive. So keep only the value function entries
        #for those reachable states.
        value_functions[0].append(vf[0])
        for v in range(max_demand+1,0,-1):
            value_functions[0].append(vf[-v])
    return value_functions[0][0] #initial state's value of the 0th value function
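
The filtering above keeps only the value-function entries for inventory levels that the (S,s) sampling policy can actually visit: level 0 and levels max_inventory - max_demand through max_inventory. A minimal standalone sketch of that index arithmetic (the sizes are hypothetical, and it assumes the MDP states are the inventory levels 0..max_inventory):

import numpy as np

# hypothetical sizes, for illustration only
max_inventory, max_demand = 10, 3
vf = np.arange(max_inventory + 1, dtype=float)  # one value per inventory level 0..max_inventory

# levels reachable under the (S,s) sampling policy
reachable = [0] + list(range(max_inventory - max_demand, max_inventory + 1))
# the same slice the function builds: vf[0] plus the last max_demand+1 entries
filtered = [vf[0]] + [vf[-v] for v in range(max_demand + 1, 0, -1)]
assert np.allclose(filtered, [vf[level] for level in reachable])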
Example #2
def randomly_improve_V(vf, threshold, dir_points):
    is_multiple_v = True

    value_functions = [[]]
    value_functions[0].append(vf[0])
    for s in range(max_demand+1,0,-1):
        value_functions[0].append(vf[-s])

    #This loop iterates incrementally, using the latest value function
    #to further improve upon it.
    for i in range(1,num_iterations_for_vf+1,1):
        #print("random vf",value_functions)
        
        threshold = np.zeros((tuple_size, action_count))
    
        #value_functions.append(np.random.randint(10, size=(max_demand-min_demand+2))) 
        rmdp = crobust.MDP(0, discount_factor)
        pos=0
        
        print("num iteration----",i)
        for s in range(mdp.state_count()):
            actions = mdp.action_count(s)
            for a in range(actions):

                if len(mdp.get_toids(s,a))==0:
                    continue
                threshold[0,pos] = s
                threshold[1,pos] = a
                threshold[2,pos] = construct_rmdp(s, a, value_functions, rmdp, dir_points, is_multiple_v) 
                pos += 1
        #print("MDP: ",rmdp.to_json())

        sol = rmdp.rsolve_vi("robust_l1".encode(),threshold)
        vf = sol.valuefunction
        
        value_functions.append([])
        value_functions[i].append(vf[0])
        for s in range(max_demand+1,0,-1):
            value_functions[i].append(vf[-s])
        
    return vf[0]
Example #3
    for s in range(num_states):
        # action left
        transitions[s, 0, max(s - 1, 0)] = 1

        # action right
        transitions[s, 1, s] = 0.4 if s == 0 else 0.6
        transitions[s, 1,
                    min(s + 1, num_states -
                        1)] = 0.6 if s == 0 else (0.6 if s == num_states -
                                                  1 else 0.35)
        transitions[s, 1,
                    max(s -
                        1, 0)] = 0.4 if s == 0 else (0.4 if s == num_states -
                                                     1 else 0.05)

    true_mdp = crobust.MDP(0, discount_factor)
    for s in range(num_states):
        true_mdp.add_transition(s, 0, max(s - 1,
                                          0), transitions[s, 0,
                                                          max(s - 1, 0)],
                                rewards[s, 0, max(s - 1, 0)])

        true_mdp.add_transition(s, 1, s, transitions[s, 1, s], rewards[s, 1,
                                                                       s])
        if s < num_states - 1:
            true_mdp.add_transition(
                s, 1, min(s + 1, num_states - 1),
                transitions[s, 1, min(s + 1, num_states - 1)],
                rewards[s, 1, min(s + 1, num_states - 1)])
        if s > 0:
            true_mdp.add_transition(s, 1, max(s - 1, 0),
                                    transitions[s, 1, max(s - 1, 0)],
                                    rewards[s, 1, max(s - 1, 0)])

def OFVF(num_states, num_actions, num_next_states, valuefunctions, posterior_transition_points, num_update, sa_confidence, discount_factor):
    """
    Method to incrementally improve the value function by adding the new value function to the
    previously computed value functions, finding the nominal point & threshold for this cluster
    of value functions with the required sa-confidence.
    
    @valuefunctions The initially known value function computed from the true MDP
    @posterior_transition_points The posterior transition points obtained from Bayesian sampling,
                                    for which the nominal point & threshold are computed
    @num_update Number of updates over the value functions
    @sa_confidence Required confidence for each state-action computed from the Union Bound
    
    @return valuefunction The updated final value function
    """
    horizon = 1
    #s = 0
    valuefunctions = [valuefunctions]

    #Store the nominal points for each state-action pairs
    nomianl_points = {}
    
    #Store the latest nominal of nominal point & threshold
    nominal_threshold = {}
    under_estimate, real_regret = 0.0, 0.0
    i=0
    while i <= num_update:
        #try:
        #keep track whether the current iteration keeps the mdp unchanged
        is_mdp_unchanged = True
        threshold = [[] for _ in range(3)]
        rmdp = crobust.MDP(0, discount_factor)
        #print("update", i)
        for s in range(num_states):
            for a in range(num_actions):
                
                bayes_points = np.asarray(posterior_transition_points[s,a])
    
                RSVF_nomianlPoints = []
                
                #for bayes_points in trans:
                #print("bayes_points", bayes_points, "valuefunctions[-1]", valuefunctions[-1])
                ivf = construct_uset_known_value_function(bayes_points, valuefunctions[-1], sa_confidence)
                RSVF_nomianlPoints.append(ivf[2])
                new_trp = np.mean(RSVF_nomianlPoints, axis=0)
                
                if (s,a) not in nomianl_points:
                    nomianl_points[(s,a)] = []
                
                trp, th = None, 0
                #If there's a previously constructed L1 ball, check whether the new nominal point
                #resides outside of the current L1 ball & needs to be considered.
                if (s,a) in nominal_threshold:
                    old_trp, old_th = nominal_threshold[(s,a)][0], nominal_threshold[(s,a)][1]
                    
                    #Compute the L1 distance between the newly computed nominal point & the previous 
                    #nominal of nominal points
                    new_th = np.linalg.norm(new_trp - old_trp, ord = 1)
                    
                    #If the new point is inside the previous L1 ball, don't consider it & proceed with
                    #the previous trp & threshold
                    if  (new_th - old_th) < 0.0001:
                        trp, th = old_trp, old_th
                
                #Consider the new nominal point to construct a new uncertainty set. This block will
                #execute if there's no previous nominal_threshold entry or the new nominal point
                #resides outside of the existing L1 ball
                if trp is None:
                    is_mdp_unchanged = False
                    nomianl_points[(s,a)].append(new_trp)
                    
                    #Find the center of the L1 ball for the nominal points with different 
                    #value functions
                    trp, th = find_nominal_point(np.asarray(nomianl_points[(s,a)]))
                    nominal_threshold[(s,a)] = (trp, th)
                
                threshold[0].append(s)
                threshold[1].append(a)
                threshold[2].append(th)
                
                trp /= np.sum(trp)
                #Add the current transition to the RMDP
                for next_st in range(num_next_states):
                    rmdp.add_transition(s, a, next_st, trp[int(next_st)], rewards[s,a,next_st])
        
        #Solve the current RMDP
        rsol = rmdp.rsolve_mpi(b"optimistic_l1",threshold)
        
        #If the whole MDP is unchanged, meaning the new value function didn't change the uncertainty
        #set for any state-action, no need to iterate more!
        if is_mdp_unchanged or i==num_update-1:
            #print("**** Add Values *****")
            #print("MDP remains unchanged after number of iteration:",i)
            #print("rsol.valuefunction",rsol.valuefunction)  
                            
            return rsol
        
        valuefunction = rsol.valuefunction
        valuefunctions.append(valuefunction)
        i+=1
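
find_nominal_point is called above but not defined in these excerpts. Going only by the comment (the center of an L1 ball around the collected nominal points), a minimal hypothetical stand-in could use the element-wise mean as the center and the largest L1 distance to it as the threshold; the actual helper may instead solve for the minimal enclosing L1 ball.

import numpy as np

def find_nominal_point_sketch(points):
    """Hypothetical stand-in for find_nominal_point: a center and an L1 radius
    that cover all collected nominal points (not necessarily the minimal ball)."""
    points = np.asarray(points)
    center = points.mean(axis=0)                                      # simple choice of center
    radius = np.max(np.linalg.norm(points - center, ord=1, axis=1))   # L1 distance to farthest point
    return center, radius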
def Optimism_VF(num_states, num_actions, num_next_states, true_transitions, rewards, discount_factor, confidence, num_bayes_samples, num_episodes, num_runs, horizon, true_solution):  
    #num_bayes_samples = 20
    num_update = 10
    
    confidences = []
    regret_OFVF = np.zeros( (num_runs, num_episodes) )
    violations = np.zeros( (num_runs, num_episodes) )
    
    for m in range(num_runs):
        # Initialize uniform Dirichlet prior
        prior = np.ones( (num_states, num_actions, num_next_states) )
        samples = np.zeros( (num_states, num_actions, num_next_states) )
        posterior = prior + samples
        # Run the episodes
        for k in range(num_episodes):
            sampled_mdp = crobust.MDP(0, discount_factor)
            confidence = 1-1/(k+1)
            sa_confidence = 1-(1-confidence)/(num_states*num_actions) # !!! Apply union bound to compute confidence for each state-action
            if m==0:
                confidences.append(confidence)
            
            # Compute posterior
            posterior = posterior+samples
            thresholds = [[] for _ in range(3)]
            
            posterior_transition_points = {}
            for s in range(num_states):
                for a in range(num_actions):
                    bayes_samples = np.random.dirichlet(posterior[s,a], num_bayes_samples)
                    posterior_transition_points[(s,a)] = bayes_samples
                    
                    nominal_point_bayes = np.mean(bayes_samples, axis=0)
                    nominal_point_bayes /= np.sum(nominal_point_bayes)
                    
                    bayes_threshold = compute_bayesian_threshold(bayes_samples, nominal_point_bayes, sa_confidence)
    
                    for s_next in range(num_next_states):
                        sampled_mdp.add_transition(s, a, s_next, nominal_point_bayes[s_next], rewards[s,a,s_next])
                        
                    # construct the threshold for each state-action
                    thresholds[0].append(s) # from state
                    thresholds[1].append(a) # action
                    thresholds[2].append(bayes_threshold) # allowed deviation
            
            # Compute current solution
            cur_solution = sampled_mdp.rsolve_mpi(b"optimistic_l1",np.array(thresholds)) # solve_mpi()
            
            rsol = OFVF(num_states, num_actions, num_next_states, cur_solution[0], posterior_transition_points, num_update, sa_confidence, discount_factor)
            
            regret_OFVF[m,k] = abs(rsol[0][0] - true_solution[0][0])
            
            violations[m,k] = rsol[0][0] - true_solution[0][0]
            
            rpolicy = rsol.policy

            samples = np.zeros((num_states, num_actions, num_next_states))

            # Follow the policy to collect transition samples
            cur_state = 0
            for h in range(horizon):
                action = rpolicy[cur_state]
                #print("cur_state", cur_state, "cur_action", action)
                next_state = np.random.choice(num_next_states, 1, p=true_transitions[cur_state, action])[0]
                #print("next_state", next_state)
                samples[cur_state, action, next_state] += 1
                cur_state = next_state

    #regret_OFVF = np.mean(regret_OFVF, axis=1)
    #plt.plot(np.cumsum(regret_OFVF))
    #plt.show()
    
    violations = np.mean(violations<0, axis=0)
    
    return np.amin(regret_OFVF, axis=0), np.mean(regret_OFVF, axis=0), violations, confidences
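
compute_bayesian_threshold is likewise not shown in these excerpts. Judging from how it is called (posterior samples, a nominal point, and a per state-action confidence), a plausible sketch takes the sa_confidence quantile of the L1 distances between the Bayes samples and the nominal point; this is an assumption, not the original implementation.

import numpy as np

def compute_bayesian_threshold_sketch(bayes_samples, nominal_point, confidence):
    """Hypothetical stand-in: L1 radius around nominal_point that contains a
    `confidence` fraction of the posterior samples."""
    distances = np.linalg.norm(bayes_samples - nominal_point, ord=1, axis=1)
    return np.quantile(distances, confidence)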
def UCRL2(num_states, num_actions, num_next_states, true_transitions, rewards, discount_factor, num_episodes, num_runs, horizon, true_solution):
    """
    Implements the UCRL2 algorithm described in the Jaksch2010 paper (Near-optimal Regret Bounds for Reinforcement Learning).
    
    Parameters
    ----------
    num_states : int
        Number of states in the MDP
    num_actions : int
        Number of actions in the MDP
    num_next_states : int
        Number of possible next states. A common formulation is to keep it the same as num_states, with transition probabilities adjusted accordingly.
    true_transitions : numpy array
        num_states x num_actions x num_next_states dimensional array containing the true transition parameters.
    rewards : numpy array
        num_states x num_actions x num_next_states dimensional array containing the reward for each transition
    discount_factor : float
        Discount factor for the MDP
    num_episodes : int
        Number of episodes to run
    num_runs : int
        Number of independent runs to average over
    true_solution : Solution object of CRAAM
        The solution of the true MDP
        
    Returns
    --------
    numpy array
        Computed regret
    """
    # ***start UCRL2***
    #num_next_states = num_states
    regret_ucrl = np.zeros( (num_runs, num_episodes) )
    
    #num_runs, num_episodes = 1, 1
    
    for m in range(num_runs):
        t=1
        # state-action counts
        Nk = np.zeros( (num_states, num_actions) )
        Nk_ = np.zeros( (num_states, num_actions) )
        
        # accumulated rewards
        Rk = np.zeros( (num_states, num_actions) )
        
        # accumulated transition counts, initialized to uniform transition
        Pk = np.ones( (num_states, num_actions, num_next_states) )/num_next_states
    
        for k in range(num_episodes):

            # ***Initialize
            tk = t #set the start time of episode k
            Vk = np.zeros( (num_states, num_actions) ) # init the state-action count for episode k
            Nk += Nk_
            r_hat = [ Rk[s,a]/max(1,Nk[s,a]) for s in range(num_states) for a in range(num_actions)]
            p_hat = np.array([ Pk[s,a,s_next]/max(1,Nk[s,a]) for s in range(num_states) for a in range(num_actions)\
                                                                for s_next in range(num_next_states) ]).reshape((num_states, num_actions,num_next_states))
            for s in range(num_states):
                for a in range(num_actions):
                    p_hat[s,a] /= np.sum(p_hat[s,a])
                    
            # ***Compute policy
            psi_r = [math.sqrt(7*math.log(2*num_next_states*num_actions*tk/confidence)/(2*max(1,Nk[s,a]))) for s in range(num_states) for a in range(num_actions)]
            psi_a = [math.sqrt(14*num_next_states*math.log(2*num_actions*tk/confidence)/(max(1,Nk[s,a]))) for s in range(num_states) for a in range(num_actions)]
            
            estimated_mdp = crobust.MDP(0, discount_factor)
            thresholds = [[] for _ in range(3)]
            
            for s in range(num_states):
                for a in range(num_actions):
                    
                    for s_next in range(num_next_states):
                        estimated_mdp.add_transition(s, a, s_next, p_hat[s,a,s_next], rewards[s,a,s_next]) # as the reward is upper bounded by psi_r from mean reward r_hat
                    # construct the threshold for each state-action
                    thresholds[0].append(s) # from state
                    thresholds[1].append(a) # action
                    thresholds[2].append(psi_a[s*num_actions + a]) # allowed deviation; psi_a is a flat list ordered by (s, a)
            #print(estimated_mdp.to_json())

            computed_solution = estimated_mdp.rsolve_mpi(b"optimistic_l1",np.array(thresholds))
            computed_policy = computed_solution.policy

            regret_ucrl[m,k] = abs(abs(computed_solution[0][0])-true_solution[0][0])
        
            # ***Execute policy
            Nk_ = np.zeros( (num_states, num_actions) )
            cur_state = 0

            for h in range(horizon): #Vk[action] < max(1,Nk[action]):
                action = computed_policy[cur_state]
                next_state = np.random.choice(num_next_states, 1, p=true_transitions[cur_state, action])[0]
                reward = rewards[cur_state,action,next_state]
                Vk[cur_state, action] += 1
                t += 1
                
                Rk[cur_state, action] += reward
                Pk[cur_state, action, next_state] += 1
                Nk_[cur_state, action] += 1
                cur_state = next_state

    #regret_ucrl = np.mean(regret_ucrl, axis=0)
    return np.amin(regret_ucrl, axis=0), np.mean(regret_ucrl, axis=0)
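
The UCRL2 confidence radii psi_r and psi_a above are built as flat lists ordered by (s, a), which is why the threshold lookup indexes them with s*num_actions + a. An equivalent sketch (not part of the original code) that keeps them as 2-D arrays makes the per state-action lookup explicit:

import math
import numpy as np

def ucrl2_radii_sketch(Nk, num_next_states, tk, confidence):
    """UCRL2 reward/transition confidence radii (Jaksch et al., 2010) as 2-D arrays."""
    num_states, num_actions = Nk.shape
    counts = np.maximum(1, Nk)
    psi_r = np.sqrt(7 * math.log(2 * num_next_states * num_actions * tk / confidence) / (2 * counts))
    psi_a = np.sqrt(14 * num_next_states * math.log(2 * num_actions * tk / confidence) / counts)
    return psi_r, psi_a  # index as psi_r[s, a], psi_a[s, a]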
def PSRL(num_states, num_actions, num_next_states, true_transitions, rewards, discount_factor, num_episodes, num_runs, horizon, true_solution):
    """
    Implements the Posterior Sampling RL algorithm described in the Osband2013 paper ((More) Efficient Reinforcement Learning via Posterior Sampling).
    
    Parameters
    ----------
    num_states : int
        Number of states in the MDP
    num_actions : int
        Number of actions in the MDP
    num_next_states : int
        Number of possible next states. A common formulation is to keep it the same as num_states, with transition probabilities adjusted accordingly.
    true_transitions : numpy array
        num_states x num_actions x num_next_states dimensional array containing the true transition parameters.
    rewards : numpy array
        num_states x num_actions x num_next_states dimensional array containing the reward for each transition
    discount_factor : float
        Discount factor for the MDP
    num_episodes : int
        Number of episodes to run
    num_runs : int
        Number of independent runs to average over
    true_solution : Solution object of CRAAM
        The solution of the true MDP
    
    Returns
    --------
    numpy array
        Computed regret
    """
    
    regret_psrl = np.zeros( (num_runs, num_episodes) )
    
    for m in range(num_runs):
        # Initialize uniform Dirichlet prior
        prior = np.ones( (num_states, num_actions, num_next_states) )    
        samples = np.zeros( (num_states, num_actions, num_next_states) )
        posterior = prior + samples
        # Run episodes for the PSRL
        for k in range(num_episodes):
            sampled_mdp = crobust.MDP(0, discount_factor)
            
            # Compute posterior
            posterior = posterior+samples
            
            for s in range(num_states):
                for a in range(num_actions):
                    trp =  np.random.dirichlet(posterior[s,a], 1)[0]
                    for s_next in range(num_next_states):
                        sampled_mdp.add_transition(s, a, s_next, trp[s_next], rewards[s,a,s_next])
    
            # Compute current solution
            cur_solution = sampled_mdp.solve_mpi()
            cur_policy = cur_solution.policy

            #action = cur_policy[0] # action for the nonterminal state 0
            regret_psrl[m,k] = abs(cur_solution[0][0]-true_solution[0][0])
            #print("PSRL cur_solution[0][0]",cur_solution[0][0], "Regret: ", regret_psrl[k,m])
            samples = np.zeros((num_states, num_actions, num_next_states))
            
            # Follow the policy to collect transition samples
            cur_state = 0
            for h in range(horizon):
                action = cur_policy[cur_state]
                next_state = np.random.choice(num_next_states, 1, p=true_transitions[cur_state, action])[0]
                samples[cur_state, action, next_state] += 1
                cur_state = next_state
                
    #regret_psrl = np.mean(regret_psrl, axis=1)
    return np.amin(regret_psrl, axis=0), np.mean(regret_psrl, axis=0)
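
The posterior update used in PSRL (and in BayesUCRL below) is the standard Dirichlet-multinomial conjugate update: the posterior parameters are simply the prior parameters plus the observed transition counts for each state-action. A tiny numeric sketch of one such update:

import numpy as np

prior = np.ones(3)                            # uniform Dirichlet prior over 3 next states
counts = np.array([5.0, 1.0, 0.0])            # observed transitions from some (s, a)
posterior = prior + counts                    # Dirichlet(6, 2, 1)
sampled_trp = np.random.dirichlet(posterior)  # one plausible transition vector
print(posterior, sampled_trp, sampled_trp.sum())  # the sampled vector sums to 1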
def BayesUCRL(num_states, num_actions, num_next_states, true_transitions, rewards, discount_factor, confidence, num_bayes_samples, num_episodes, num_runs, horizon, true_solution):
    """
    Implements the Bayes UCRL idea. Computes ambiguity set from posterior samples for required confidence levels.
    
    Parameters
    ----------
    num_states : int
        Number of states in the MDP
    num_actions : int
        Number of actions in the MDP
    num_next_states : int
        Number of possible next states. A common formulation is to keep it the same as num_states, with transition probabilities adjusted accordingly.
    true_transitions : numpy array
        num_states x num_actions x num_next_states dimensional array containing the true transition parameters.
    rewards : numpy array
        num_states x num_actions x num_next_states dimensional array containing the reward for each transition
    discount_factor : float
        Discount factor for the MDP
    confidence : float
        The required PAC confidence
    num_bayes_samples : int
        Number of Bayes samples to be taken from posterior
    num_episodes : int
        Number of episodes to run
    num_runs : int
        Number of independent runs to average over
    true_solution : Solution object of CRAAM
        The solution of the true MDP
        
    Returns
    --------
    numpy array
        Computed regret
    """  
    #num_bayes_samples = 20
    
    regret_bayes_ucrl = np.zeros( (num_runs, num_episodes) )
    
    for m in range(num_runs):
        # Initialize uniform Dirichlet prior
        prior = np.ones( (num_states, num_actions, num_next_states) )    
        samples = np.zeros((num_states, num_actions, num_next_states))
        posterior = prior + samples
        # Run episodes for Bayes UCRL
        for k in range(num_episodes):
            sampled_mdp = crobust.MDP(0, discount_factor)
            
            # Compute posterior
            posterior = posterior+samples
            thresholds = [[] for _ in range(3)]
            
            confidence = 1-1/(k+1)
            sa_confidence = 1-(1-confidence)/(num_states*num_actions) # !!! Apply union bound to compute confidence for each state-action
            
            for s in range(num_states):
                for a in range(num_actions):
                    bayes_samples =  np.random.dirichlet(posterior[s,a], num_bayes_samples)
                    nominal_point_bayes = np.mean(bayes_samples, axis=0)
                    nominal_point_bayes /= np.sum(nominal_point_bayes)
                    
                    bayes_threshold = compute_bayesian_threshold(bayes_samples, nominal_point_bayes, sa_confidence)
                    
                    for s_next in range(num_next_states):
                        sampled_mdp.add_transition(s, a, s_next, nominal_point_bayes[s_next], rewards[s,a,s_next])
                        
                    # construct the threshold for each state-action
                    thresholds[0].append(s) # from state
                    thresholds[1].append(a) # action
                    thresholds[2].append(bayes_threshold) # allowed deviation
            
            # Compute current solution
            cur_solution = sampled_mdp.rsolve_mpi(b"optimistic_l1",np.array(thresholds))
            cur_policy = cur_solution.policy
            #action = cur_policy[0] # action for the nonterminal state 0
            regret_bayes_ucrl[m,k] = abs(abs(cur_solution[0][0])-true_solution[0][0])
            #print("Bayes UCRL cur_solution[0][0]",cur_solution[0][0], "Regret: ", regret_bayes_ucrl[k,m])
            samples = np.zeros((num_states, num_actions, num_next_states))
                
            # Follow the policy to collect transition samples
            cur_state = 0
            for h in range(horizon):
                action = cur_policy[cur_state]
                #print("cur_state", cur_state, "cur_action", action)
                next_state = np.random.choice(num_next_states, 1, p=true_transitions[cur_state, action])[0]
                #print("next_state", next_state)
                samples[cur_state, action, next_state] += 1
                cur_state = next_state

    #regret_bayes_ucrl = np.mean(regret_bayes_ucrl, axis=1)
    return np.amin(regret_bayes_ucrl, axis=0), np.mean(regret_bayes_ucrl, axis=0)
Example #9
    num_next_states = 3
    num_actions = 3
    discount_factor = 0.95
    num_episodes = 100
    num_runs = 10
    horizon = 10
    num_bayes_samples = 100

    # rewards for 3 possible next terminal states
    rewards = np.arange(1, num_next_states + 1, dtype=float) * 10

    # 3 possible actions, each action can lead to 3 terminal states with different probabilities. transitions[i] is the transition probability for action i.
    transitions = np.array([[0.6, 0.2, 0.2], [0.2, 0.6, 0.2], [0.2, 0.2, 0.6]])

    # Construct the true MDP with true parameters
    true_mdp = crobust.MDP(0, discount_factor)
    for a in range(num_actions):
        for s in range(num_next_states):
            true_mdp.add_transition(0, a, s + 1, transitions[a, s], rewards[s])
    true_solution = true_mdp.solve_mpi()
    print(true_solution)
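
Throughout these examples the CRAAM solution object is read as sol[0][0] for the value of the initial state 0, and as sol.valuefunction / sol.policy elsewhere. A short sketch of reading the solution printed above, reusing the variables defined in this snippet:

    # sol[0] is the value function and sol.policy the greedy policy, as these examples use them
    value_at_start = true_solution[0][0]   # presumably the same quantity as true_solution.valuefunction[0]
    optimal_policy = true_solution.policy
    print("V(0) =", value_at_start, "policy:", optimal_policy)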

### UCRL2
if __name__ == "__main__":
    # ***start UCRL2***
    t = 1
    regret_ucrl = np.zeros((num_runs, num_episodes))

    for m in range(num_runs):

        # state-action counts
def incrementally_replace_V(valuefunction, num_samples, num_simulation,\
                                                        num_update, sa_confidence, orig_sol):
    """
    Method to incrementally improve the value function by replacing the old value function with 
    the new one.
    
    @valuefunction The initially known value function
    @num_samples Number of samples to estimate the true distribution
    @num_simulation Number of simulations
    @num_update Number of updates over the value functions
    @sa_confidence Required confidence for each state-action computed from the Union Bound
    
    @return valuefunction The updated final value function
    """
    horizon = 1
    X = []
    Y = []
    
    list_transitions_points = {}
    for s in population:
        #transitions_points = get_Bootstrapped_transition_reward(s, horizon,\
                                        #num_samples, np.random.randint(len(population)))
        #print("Incrementally replace V")
        transitions_points, _, _ = get_Bayesian_transition_kernel(s, num_samples)
        list_transitions_points[s] = transitions_points
    
    under_estimate = 99999
    real_regret = 0.0
    for i in range(num_update):
        threshold = [[] for _ in range(3)]
        rmdp = crobust.MDP(0, discount_factor)
        for s in population:
            #transitions_points = get_Bootstrapped_transition_reward(s, horizon, num_samples, i)
            transitions_points = list_transitions_points[s] #get_Bayesian_transition_kernel(s, num_samples)
            for a in range(num_actions):
                dir_points = np.asarray(transitions_points[a])
                res = construct_uset_known_value_function(dir_points, valuefunction, sa_confidence)
                
                threshold[0].append(s)
                threshold[1].append(a)
                threshold[2].append(res[1])
                
                trp = res[2]
                
                for next_st in population:
                    #reward = calc_reward(next_st, trp[int(next_st)], a)
                    reward = calc_reward(s, a)
                    rmdp.add_transition(s, a, next_st, trp[int(next_st)], reward)
        
        rsol = rmdp.rsolve_mpi(b"robust_l1",threshold)
        rpolicy = rsol.policy        
        violation = 0
        #rret = rmdp.solve_mpi(policy=rpolicy)
        ret = est_true_mdp.solve_mpi(policy=rpolicy)
        cur_regret = abs(np.dot(initial,ret.valuefunction) - np.dot(initial,rsol.valuefunction))
        if cur_regret>under_estimate:
            #ropt_sol = est_true_mdp.solve_mpi(policy=rpolicy)
            real_regret = abs(np.dot(initial,orig_sol.valuefunction) -\
                                                np.dot(initial,ret.valuefunction))
            violation = 1 if (np.dot(initial, ret.valuefunction) - np.dot(initial,\
                            rsol.valuefunction))<0 else 0
            break

        under_estimate = cur_regret
        valuefunction = rsol.valuefunction
        X.append(i)
        Y.append(valuefunction[0])

    return under_estimate, real_regret, violation
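
The regret bookkeeping above (and in RSVF and incrementally_add_V below) weighs value functions by the initial state distribution `initial`. A tiny numeric sketch of the under-estimate computation, with hypothetical two-state vectors:

import numpy as np

initial = np.array([0.5, 0.5])    # hypothetical initial state distribution
vf_est = np.array([10.0, 12.0])   # value of the robust policy on the estimated MDP
vf_rob = np.array([9.0, 11.5])    # robust (pessimistic) value function from the RMDP
under_estimate = abs(np.dot(initial, vf_est) - np.dot(initial, vf_rob))
print(under_estimate)             # 0.75: how far the robust solution under-estimates the return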
def RSVF(valuefunctions, posterior_transition_points, num_samples, num_update, \
            sa_confidence, orig_sol):
    """
    Method to incrementally improve the value function by adding the new value function to the
    previously computed value functions, finding the nominal point & threshold for this cluster
    of value functions with the required sa-confidence.
    
    @valuefunctions The initially known value function computed from the true MDP
    @posterior_transition_points The posterior transition points obtained from Bayesian sampling,
                                    for which the nominal point & threshold are computed
    @num_samples Number of samples to estimate the true distribution
    @num_update Number of updates over the value functions
    @sa_confidence Required confidence for each state-action computed from the Union Bound
    @orig_sol The solution to the estimated true MDP
    
    @return valuefunction The updated final value function
    """
    horizon = 1
    X = []
    Y = []
    
    valuefunctions = [valuefunctions]
    th_list = []
    """
    list_transitions_points = {}
    for s in population:
        #transitions_points = get_Bootstrapped_transition_reward(s, horizon,\
                                        #num_samples, np.random.randint(len(population)))
        #print("incrementally add v")
        transitions_points, _, _ = get_Bayesian_transition_kernel(s, num_samples)
        list_transitions_points[s] = transitions_points
    """
    #Store the nominal points for each state-action pairs
    nomianl_points = {}
    
    #Store the latest nominal of nominal point & threshold
    nominal_threshold = {}
    under_estimate, real_regret = 0.0, 0.0
    i=0
    while i <= num_update:
        try:
            #keep track whether the current iteration keeps the mdp unchanged
            is_mdp_unchanged = True
            threshold = [[] for _ in range(3)]
            rmdp = crobust.MDP(0, discount_factor)
            for s in population:
                for a in range(num_actions):
                    
                    trans = np.asarray(posterior_transition_points[s][a])
                    RSVF_th = [] # ** Not being used
                    RSVF_nomianlPoints = []
                    
                    for dir_points in trans:
                        ivf = construct_uset_known_value_function(dir_points, valuefunctions[-1],\
                                                                sa_confidence)
                        RSVF_th.append(ivf[1])
                        RSVF_nomianlPoints.append(ivf[2])
                    new_trp = np.mean(RSVF_nomianlPoints, axis=0)
                    
                    if (s,a) not in nomianl_points:
                        nomianl_points[(s,a)] = []
                    
                    trp, th = None, 0
                    #If there's a previously constructed L1 ball, check whether the new nominal point
                    #resides outside of the current L1 ball & needs to be considered.
                    if (s,a) in nominal_threshold:
                        old_trp, old_th = nominal_threshold[(s,a)][0], nominal_threshold[(s,a)][1]
                        
                        #Compute the L1 distance between the newly computed nominal point & the previous 
                        #nominal of nominal points
                        new_th = np.linalg.norm(new_trp - old_trp, ord = 1)
                        
                        #If the new point is inside the previous L1 ball, don't consider it & proceed with
                        #the previous trp & threshold
                        if  (new_th - old_th) < 0.0001:
                            trp, th = old_trp, old_th
                    
                    #Consider the new nominal point to construct a new uncertainty set. This block will
                    #execute if there's no previous nominal_threshold entry or the new nominal point
                    #resides outside of the existing L1 ball
                    if trp is None:
                        is_mdp_unchanged = False
                        nomianl_points[(s,a)].append(new_trp)
                        
                        #Find the center of the L1 ball for the nominal points with different 
                        #value functions
                        trp, th = find_nominal_point(np.asarray(nomianl_points[(s,a)]))
                        nominal_threshold[(s,a)] = (trp, th)
                    
                    threshold[0].append(s)
                    threshold[1].append(a)
                    threshold[2].append(th)
                    
                    #Add the current transition to the RMDP
                    for next_st in population:
                        #reward = calc_reward(next_st, trp[int(next_st)], a)
                        reward = calc_reward(s, a)
                        rmdp.add_transition(s, a, next_st, trp[int(next_st)], reward)
            
            #Solve the current RMDP
            rsol = rmdp.rsolve_mpi(b"robust_l1",threshold)
            
            violation = 0
            
            #If the whole MDP is unchanged, meaning the new value function didn't change the uncertainty
            #set for any state-action, no need to iterate more!
            if is_mdp_unchanged or i==num_update-1:
                print("**** Add Values *****")
                print("MDP remains unchanged after number of iteration:",i)
                #print("rmdp", rmdp.to_json())
                #print("threshold", threshold)
                #print("Policy",rsol.policy, "threshold", threshold)
                print("rsol.valuefunction",rsol.valuefunction)
                
                ropt_sol = est_true_mdp.solve_mpi(policy=rsol.policy)
                
                under_estimate = abs(np.dot(initial,orig_sol.valuefunction) -\
                                                        np.dot(initial,rsol.valuefunction))
                
                real_regret = abs(np.dot(initial,orig_sol.valuefunction) -\
                                                        np.dot(initial,ropt_sol.valuefunction))
                
                violation = 1 if (np.dot(initial, ropt_sol.valuefunction) - \
                                                np.dot(initial, rsol.valuefunction)) < 0 else 0
                break
            
            valuefunction = rsol.valuefunction
            valuefunctions.append(valuefunction)
            X.append(i)
            Y.append(valuefunction[0])
            i+=1
        except Exception as e:
            print("!!! Unexpected Error in RSVF !!!", sys.exc_info()[0])
            print(e)
            continue
        
    return under_estimate, real_regret, violation
 under_estimation = [[] for _ in range(Methods.NUM_METHODS.value)] #estimated regret
 real_regret = [[] for _ in range(Methods.NUM_METHODS.value)] #optimal regret
 violations = [[] for _ in range(Methods.NUM_METHODS.value)]
 
 #num_samples = sample_step
 #pbar = tqdm.tqdm(total = (sample_step*num_iterations+1) )
 
 #while num_samples <= (sample_step*num_iterations+1):
 for pos, num_samples in enumerate(tqdm.tqdm(sample_steps)):
     cur_under_estimation = np.zeros( (Methods.NUM_METHODS.value,runs) )
     cur_real_regret = np.zeros( (Methods.NUM_METHODS.value,runs) )
     cur_violations = np.zeros( (Methods.NUM_METHODS.value,runs) )
     i=0
     while i<runs:
         try:
             est_true_mdp = crobust.MDP(0, discount_factor)
             rmdps = []
             for m in range(Methods.NUM_METHODS.value):
                 rmdps.append(crobust.MDP(0, discount_factor))
             
             posterior_transition_points = {}
 
             for s in population:
                 #Get the nominal points & thresholds for each state & all actions of Bayes, Mean, Hoeff,
                 #HoeffTight RMDPs. Get the true transition points & the posterior transition points for RSVF
                 params, true_transition_points, posterior_transition_points[s] = \
                     evaluate_uncertainty_set(s, num_samples, num_simulation, sa_confidence)
                 
                 #Construct the true MDP with true transition points
                 for a in range(num_actions):
                     for next_st in population:
Example #13
    def train(self, q_init_ret):
        """
        Implements the posterior sampling RL (PSRL) algorithm for the OpenAI Gym Mountain Car problem.
        
        Returns
        --------
        worst case regret, average regret and the final solution
        """
        regret_psrl = np.zeros((self.num_runs, self.num_episodes))
        prior = obtain_parametric_priors(self.resolution, self.num_actions)

        # initially the posterior is the same as prior
        posterior = prior
        mp_rewards = {}

        for run in range(self.num_runs):
            print("run: ", run)

            # run episodes for the PSRL
            for episode in range(self.num_episodes):
                print("episode: ", episode)
                sampled_mdp = crobust.MDP(0, self.discount_factor)
                #thresholds = [[] for _ in range(3)]

                # iterate over all state-actions, sample from the posterior distribution and construct the sampled MDP
                for s in self.all_states:
                    p, v = obs_to_index(s, self.env_low, self.env_dx)
                    cur_state = index_to_single_index(p, v, self.resolution)

                    for action in range(self.num_actions):
                        samples = posterior[p, v, action]

                        next_states = []
                        visit_stats = []

                        # unpack keys (next states) and values (Dirichlet parameters) from the samples dictionary.
                        for key, value in samples.items():
                            next_states.append(
                                index_to_single_index(key[0], key[1],
                                                      self.resolution))
                            visit_stats.append(value)

                        # sample from the Dirichlet distribution parameterized by the current posterior (visit) counts
                        trp = np.random.dirichlet(visit_stats, 1)[0]

                        for s_index, s_next in enumerate(next_states):
                            sampled_mdp.add_transition(
                                cur_state, action, s_next, trp[s_index],
                                get_reward(s_next, self.resolution,
                                           self.grid_x, self.grid_y))

                        # construct the threshold for each state-action
                        #thresholds[0].append(cur_state) # from state
                        #thresholds[1].append(action) # action
                        #thresholds[2].append(1.0) # allowed deviation
                # Compute current solution
                cur_solution = sampled_mdp.solve_mpi()
                cur_policy = cur_solution.policy

                # Compute current solution
                #cur_solution = sampled_mdp.rsolve_mpi(b"optimistic_l1",np.array(thresholds))
                #cur_policy = cur_solution.policy

                # Initial state is uniformly distributed, compute the expected value over them.
                expected_value_initial_state = 0
                for init in self.init_states:
                    state = index_to_single_index(init[0], init[1],
                                                  self.resolution)
                    expected_value_initial_state += cur_solution[0][state]
                expected_value_initial_state /= len(self.init_states)

                # regret computation: the Q-learning solution can be considered as the true solution;
                # the solution produced here by PSRL is the approximate solution and the difference between them is the regret
                regret_psrl[run, episode] = abs(
                    q_init_ret - expected_value_initial_state
                )  #abs(cur_solution[0][0]-true_solution[0][0])
                print(q_init_ret, expected_value_initial_state)

                # Follow the policy to collect transition samples
                cur_state = obs_to_index(self.env.reset(), self.env_low,
                                         self.env_dx)
                for h in range(self.horizon):
                    action = cur_policy[index_to_single_index(
                        cur_state[0], cur_state[1], self.resolution)]
                    next_state, reward, done, info = self.env.step(action)
                    next_state = obs_to_index(next_state, self.env_low,
                                              self.env_dx)
                    mp_rewards[index_to_single_index(next_state[0],
                                                     next_state[1],
                                                     self.resolution)] = reward
                    # posterior[cur_position, cur_velocity, action][next_position, next_velocity] is the entry that we want to update
                    # with the sample. This is really combining the current sample with the prior, which constitutes the posterior.
                    if (next_state[0],
                            next_state[1]) not in posterior[cur_state[0],
                                                            cur_state[1],
                                                            action]:
                        posterior[cur_state[0], cur_state[1],
                                  action][next_state[0], next_state[1]] = 0
                    posterior[cur_state[0], cur_state[1],
                              action][next_state[0], next_state[1]] += 1
                    cur_state = next_state

                    if done:
                        print("----- destination reached in", h,
                              "steps, done execution. -----")
                        break

        return np.amin(regret_psrl, axis=0), np.mean(regret_psrl,
                                                     axis=0), cur_solution
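
obs_to_index and index_to_single_index are used throughout the Mountain Car code but are not defined in these excerpts. Minimal hypothetical stand-ins consistent with how they are called (discretizing a continuous observation onto a resolution x resolution grid, and flattening a (position, velocity) index pair):

import numpy as np

def obs_to_index_sketch(obs, env_low, env_dx):
    """Hypothetical stand-in: map a continuous observation to (position, velocity) grid indices."""
    return tuple(((np.asarray(obs) - env_low) / env_dx).astype(int))

def index_to_single_index_sketch(p, v, resolution):
    """Hypothetical stand-in: flatten (position index, velocity index) into a single state id."""
    return p * resolution + v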
 def train(self, num_bayes_samples, q_init_ret):
     """
     Implements the Bayes UCRL idea. Computes ambiguity set from posterior samples for required confidence levels.
         
     Returns
     --------
     numpy array
         Computed regret
     """  
     #num_bayes_samples = 20
     
     regret_bayes_ucrl = np.zeros( (self.num_runs, self.num_episodes) )
     prior = obtain_parametric_priors(self.resolution, self.num_actions)
 
     # initially the posterior is the same as prior
     posterior = prior
     
     for run in range(self.num_runs):
         for episode in range(self.num_episodes):
             sampled_mdp = crobust.MDP(0, self.discount_factor)
             
             # Compute posterior
             #posterior = posterior+samples
             thresholds = [[] for _ in range(3)]
             
             num_states = self.resolution*self.resolution
             confidence = 1-1/(episode+1)
             sa_confidence = 1-(1-confidence)/(num_states*self.num_actions) # !!! Apply union bound to compute confidence for each state-action
             
             # iterate over all state-actions, sample from the posterior distribution and construct the sampled MDP
             for s in self.all_states:
                 p,v = obs_to_index(s, self.env_low, self.env_dx)
                 
                 print("s",s,p,v)
                 cur_state = index_to_single_index(p,v, self.resolution)
                 
                 for action in range(self.num_actions):
                     samples = posterior[p,v,action]
                     
                     next_states = []
                     visit_stats = []
                     
                      # unpack keys (next states) and values (Dirichlet parameters) from the samples dictionary.
                     for key, value in samples.items():
                         next_states.append(index_to_single_index(key[0],key[1], self.resolution))
                         visit_stats.append(value)
                     
                      # sample from the Dirichlet distribution parameterized by the current posterior (visit) counts
                     # trp = np.random.dirichlet(visit_stats, 1)
                     
                     bayes_samples =  np.random.dirichlet(visit_stats, num_bayes_samples)
                     nominal_point_bayes = np.mean(bayes_samples, axis=0)
                     nominal_point_bayes /= np.sum(nominal_point_bayes)
                     
                     bayes_threshold = compute_bayesian_threshold(bayes_samples, nominal_point_bayes, sa_confidence)
                     
                     for s_index, s_next in enumerate(next_states):
                         sampled_mdp.add_transition(cur_state, action, s_next, nominal_point_bayes[s_index], get_reward(s_next, self.resolution, self.grid_x, self.grid_y))
                     
                     # construct the threshold for each state-action
                     thresholds[0].append(cur_state) # from state
                     thresholds[1].append(action) # action
                     thresholds[2].append(bayes_threshold) # allowed deviation
             
             # Compute current solution
             cur_solution = sampled_mdp.rsolve_mpi(b"optimistic_l1",np.array(thresholds))
             cur_policy = cur_solution.policy
             
             # Initial state is uniformly distributed, compute the expected value over them.
             expected_value_initial_state = 0
             for init in self.init_states:
                 state = index_to_single_index(init[0], init[1], self.resolution)
                 expected_value_initial_state += cur_solution[0][state]
             expected_value_initial_state /= len(self.init_states)
             
              # regret computation: the Q-learning solution can be considered as the true solution;
              # the solution produced here by Bayes UCRL is the approximate solution and the difference between them is the regret
             regret_bayes_ucrl[run,episode] = abs(q_init_ret-expected_value_initial_state) #abs(cur_solution[0][0]-true_solution[0][0])
             
             # Follow the policy to collect transition samples
             cur_state = obs_to_index(self.env.reset(), self.env_low, self.env_dx)
             for h in range(self.horizon):
                 action = cur_policy[index_to_single_index(cur_state[0], cur_state[1], self.resolution)]
                 next_state, reward, done, info = self.env.step(action)
                 next_state = obs_to_index(next_state, self.env_low, self.env_dx)
                 
                  # posterior[cur_position, cur_velocity, action][next_position, next_velocity] is the entry that we want to update
                  # with the sample. This is really combining the current sample with the prior, which constitutes the posterior.
                 if (next_state[0],next_state[1]) not in posterior[cur_state[0],cur_state[1],action]:
                     posterior[cur_state[0],cur_state[1],action][next_state[0],next_state[1]] = 0
                 posterior[cur_state[0],cur_state[1],action][next_state[0],next_state[1]] += 1
                 cur_state = next_state
                 
                 if done:
                     print("----- destination reached in",h,"steps, done execution. -----")
                     break
     
     return np.amin(regret_bayes_ucrl, axis=0), np.mean(regret_bayes_ucrl, axis=0), cur_solution
Example #15
def incrementally_add_V(valuefunctions, num_samples, num_simulation,\
                                                    num_update, sa_confidence, orig_sol):
    """
    Method to incrementally improve the value function by adding the new value function to the
    previously computed value functions, finding the nominal point & threshold for this cluster
    of value functions with the required sa-confidence.
    
    @valuefunctions The initially known value function
    @num_samples Number of samples to estimate the true distribution
    @num_simulation Number of simulations
    @num_update Number of updates over the value functions
    @sa_confidence Required confidence for each state-action computed from the Union Bound
    
    @return valuefunction The updated final value function
    """

    X = []
    Y = []

    valuefunctions = [valuefunctions]
    th_list = []
    list_transitions_points = {}

    for s in range(num_total_states):
        for a in range(num_total_actions):
            transitions_points = np.random.dirichlet(transition_samples[s, a],
                                                     bayes_samples)
            list_transitions_points[(s, a)] = transitions_points

    #Store the nominal points for each state-action pairs
    nomianl_points = {}

    #Store the latest nominal of nominal point & threshold
    nominal_threshold = {}
    under_estimate, real_regret = 0.0, 0.0

    for i in range(num_update):
        #print("valuefunctions",i,": ",valuefunctions)
        #keep track whether the current iteration keeps the mdp unchanged
        is_mdp_unchanged = True
        threshold = [[] for _ in range(3)]
        rmdp = crobust.MDP(0, discount_factor)
        for s in range(num_total_states):
            for a in range(num_total_actions):
                dir_points = list_transitions_points[(
                    s, a)]  #np.asarray(transitions_points[a])

                res = construct_uset_known_value_function(dir_points, valuefunctions[-1],\
                                                            sa_confidence)

                if (s, a) not in nomianl_points:
                    nomianl_points[(s, a)] = []

                trp, th = None, 0
                #If there's a previously constructed L1 ball, check whether the new nominal point
                #needs to be considered.
                if (s, a) in nominal_threshold:
                    old_trp, old_th = nominal_threshold[(
                        s, a)][0], nominal_threshold[(s, a)][1]

                    #Compute the L1 distance between the newly computed nominal point & the previous
                    #nominal of nominal points
                    new_th = np.linalg.norm(res[2] - old_trp, ord=1)

                    #If the new point is inside the previous L1 ball, don't consider it & proceed with
                    #the previous trp & threshold
                    if (new_th - old_th) < 0.0001:
                        trp, th = old_trp, old_th

                #Consider the new nominal point to construct a new uncertainty set. This block will
                #execute if there's no previous nominal_threshold entry or the new nominal point
                #resides outside
                if trp is None:
                    #print(i,"trp is None")
                    is_mdp_unchanged = False
                    nomianl_points[(s, a)].append(res[2])

                    #Find the center of the L1 ball for the nominal points with different
                    #value functions
                    trp, th = find_nominal_point(
                        np.asarray(nomianl_points[(s, a)]))
                    nominal_threshold[(s, a)] = (trp, th)

                threshold[0].append(s)
                threshold[1].append(a)
                threshold[2].append(th)

                for next_st in range(num_total_states):
                    reward = state_action_reward[(index_to_state[s],
                                                  index_to_action[a])]
                    rmdp.add_transition(s, a, next_st, trp[int(next_st)],
                                        reward)

        rsol = rmdp.rsolve_mpi(b"robust_l1", threshold)

        violation = 0

        #If the whole MDP is unchanged, meaning the new value function didn't change the uncertainty
        #set for any state-action, no need to iterate more!
        if is_mdp_unchanged or i == num_update - 1:
            print("**** Add Values *****")
            print("MDP remains unchanged after number of iteration:", i)
            print("Policy", rsol.policy, "threshold", threshold)

            rpolicy = rsol.policy
            ret = est_true_mdp.solve_mpi(policy=rpolicy)
            under_estimate = np.dot(initial, ret.valuefunction) - np.dot(
                initial, rsol.valuefunction)

            #ropt_sol = rmdp.solve_mpi(policy=orig_sol.policy)
            real_regret = np.dot(initial,orig_sol.valuefunction) -\
                                                np.dot(initial,ret.valuefunction)

            violation = 1 if (np.dot(initial, ret.valuefunction) - np.dot(initial,\
                            rsol.valuefunction))<0 else 0
            break

        valuefunction = rsol.valuefunction
        valuefunctions.append(valuefunction)
        X.append(i)
        Y.append(valuefunction[0])

    return under_estimate, real_regret, violation