import numpy as np

import posterior_sampling


def PCRL(S, A, H, d, L, eps):
    """
    Posterior sampling over rewards (Normal-Gamma) and transitions
    (Dirichlet), forwarding an exploration parameter eps to play.

    IN
        S: list States
        A: list Actions
        H: int Horizon (length of an episode)
        d: unused here
        L: int Number of episodes
        eps: float Exploration parameter passed through to play

    OUT
        av_rew: list Average reward over each consecutive block of 10 episodes
    """
    # Make a very simple prior
    mu = 0.
    n_mu = 1.
    tau = 1.
    n_tau = 1.
    prior_ng = posterior_sampling.convert_prior(mu, n_mu, tau, n_tau)
    rew = 0
    av_rew = []
    c1 = len(S)
    prior_dir = np.ones(c1)
    R = {}
    P = {}
    time = range(H)
    for l in range(L):
        Rl = {}
        Pl = {}
        for t in time:
            for s in S:
                for a in A:
                    if (t, s, a) not in R:
                        R[(t, s, a)] = []
                        P[(t, s, a)] = np.zeros(c1)
                    # If we have not visited (t, s, a) we don't update our prior
                    if len(R[(t, s, a)]) == 0:
                        Rpost = prior_ng
                        Ppost = prior_dir
                    else:
                        data = np.array(R[(t, s, a)])
                        counts = P[(t, s, a)]
                        # Posterior updating
                        Rpost = posterior_sampling.update_normal_ig(prior_ng, data)
                        Ppost = posterior_sampling.update_dirichlet(prior_dir, counts)
                    # Posterior sampling
                    Rl[(t, s, a)] = posterior_sampling.sample_normal_ig(Rpost)[0]
                    Pl[(t, s, a)] = posterior_sampling.sample_dirichlet(Ppost)
        # Greedy policy for the sampled MDP, then play one episode
        mu = policy(R, P, Rl, Pl, S, A, H)
        rew += play(mu, H, R, P, A, eps)
        if (l + 1) % 10 == 0:
            av_rew.append(rew / 10.)
            rew = 0
    return av_rew
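# A minimal usage sketch (an assumption, not part of the original module):
# calling PCRL on a tiny 3-state, 2-action problem. This relies on policy()
# and play() being defined elsewhere in this module, since PCRL calls them.
S = list(range(3))   # states
A = list(range(2))   # actions
av_rew = PCRL(S, A, H=3, d=0, L=100, eps=0.1)  # d is unused by PCRL
print(av_rew)        # average reward per block of 10 episodes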
# Make a very simple prior (the same Normal-Gamma prior used in PCRL and PSRL)
mu, n_mu, tau, n_tau = 0., 1., 1., 1.
prior_ng = posterior_sampling.convert_prior(mu, n_mu, tau, n_tau)

# Generate some real data
real_mu = 1.
real_prec = 4.
n_data = 100
data = np.zeros(n_data)
for i in range(n_data):
    data[i] = np.random.normal(real_mu, np.sqrt(1. / real_prec))
print('True Normal distribution: ' + str((real_mu, real_prec)) + '\n')

# Sampled data from the posterior
posterior_ng = posterior_sampling.update_normal_ig(prior_ng, data)
n_samp = 10
for i in range(n_samp):
    sample_norm = posterior_sampling.sample_normal_ig(posterior_ng)
    print('Sampled Normal distribution: ' + str(sample_norm))
print('\n\n')

# ---------------------------------------------------------------------
# Updating transitions

# Make a very simple prior
n_state = 5
prior_dir = np.ones(n_state)

# Imagine we have observed the following
p_true = np.random.gamma(shape=1, size=n_state)
p_true = p_true / np.sum(p_true)
n_data = 100
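# A sketch of the analogous Dirichlet demo (an assumption, not part of the
# original script): draw multinomial counts from p_true, update the Dirichlet
# prior with the same posterior_sampling calls that PCRL and PSRL use, and
# print a few posterior samples.
counts = np.random.multinomial(n_data, p_true)
posterior_dir = posterior_sampling.update_dirichlet(prior_dir, counts)
print('True transition vector: ' + str(p_true) + '\n')
for i in range(n_samp):
    p_samp = posterior_sampling.sample_dirichlet(posterior_dir)
    print('Sampled transition vector: ' + str(p_samp))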
def PSRL(S, A, H, L):
    """
    Runs posterior sampling for reinforcement learning (PSRL).

    IN
        S: list States
        A: list Actions
        H: int Horizon (length of an episode)
        L: int Number of episodes

    OUT
        av_rew: list Average reward over each consecutive block of 10 episodes
    """
    # Make a very simple prior
    mu = 0.
    n_mu = 1.
    tau = 1.
    n_tau = 1.
    prior_ng = posterior_sampling.convert_prior(mu, n_mu, tau, n_tau)
    c1 = len(S)
    prior_dir = np.ones(c1)
    R = {}
    P = {}
    time = range(H)
    av_rew = []
    rew = 0
    for l in range(L):
        Rl = {}
        Pl = {}
        for t in time:
            for s in S:
                for a in A:
                    if (t, s, a) not in R:
                        R[(t, s, a)] = []
                        P[(t, s, a)] = np.zeros(c1)
                    # If we have not visited (t, s, a) we don't update our prior
                    if len(R[(t, s, a)]) == 0:
                        Rpost = prior_ng
                        Ppost = prior_dir
                    else:
                        data = np.array(R[(t, s, a)])
                        counts = P[(t, s, a)]
                        # Posterior updating
                        Rpost = posterior_sampling.update_normal_ig(prior_ng, data)
                        Ppost = posterior_sampling.update_dirichlet(prior_dir, counts)
                    # Posterior sampling
                    Rl[(t, s, a)] = posterior_sampling.sample_normal_ig(Rpost)[0]
                    Pl[(t, s, a)] = posterior_sampling.sample_dirichlet(Ppost)
        # Optimal policy for the sampled MDP
        mu = policy(R, P, Rl, Pl, S, A, H)
        # Episode
        rew += play(mu, H, R, P)
        if (l + 1) % 10 == 0:
            print(rew / 10.)
            av_rew.append(rew / 10.)
            rew = 0
    return av_rew
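# A minimal usage sketch (an assumption, not part of the original module):
# PSRL on the same tiny problem as the PCRL example above. Unlike PCRL, it
# plays the sampled-MDP policy directly, with no eps parameter.
av_rew_psrl = PSRL(S, A, H=3, L=100)
print(av_rew_psrl)   # average reward per block of 10 episodes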