# Module layout assumed from the references below: environment and
# finite_tabular_agents are used directly; FeatureTrueState is assumed
# to live in feature_extractor.
import environment
import finite_tabular_agents
from feature_extractor import FeatureTrueState


def outputConfidenceKnownR(alg, nextStateMul, nObs):
    '''
    Output the confidence set for a given algorithm when R is known
    but P is unknown.

    Args:
        alg - string for which algorithm to use
        nextStateMul - how many multiples of good/bad states there are
        nObs - how many observations split between the next states

    Returns:
        qMax - the 0.05 upper quantile
    '''
    nState = 1 + 2 * nextStateMul
    nextObs = nObs / float(nState - 1)  # observations per next state

    # Make the environment
    env = environment.make_confidenceMDP(nextStateMul)

    # Make the feature extractor
    f_ext = FeatureTrueState(env.epLen, env.nState, env.nAction, env.nState)

    # Make the agent
    agent_constructor = alg_dict[alg]
    agent = agent_constructor(env.nState, env.nAction, env.epLen)

    # Let the agent know the prior
    agent.R_prior[0, 0] = (0, 1e9)
    for s in range(1, nState):
        # Pin down the transition prior
        agent.P_prior[s, 0][s] += 1e9
        # Pin down the reward prior
        agent.R_prior[s, 0] = (s % 2, 1e9)

    for ep in xrange(nObs):
        # Reset the environment
        env.reset()
        agent.update_policy(ep)

        pContinue = 1
        while pContinue > 0:
            # Step through the episode
            h, oldState = f_ext.get_feat(env)
            action = 0
            reward, newState, pContinue = env.advance(action)
            agent.update_obs(oldState, action, reward, newState,
                             pContinue, h)

    agent.update_policy()
    return agent.qVals[0, 0][0]
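# Usage sketch (hypothetical values; assumes the alg_dict defined further
# down this script is in scope at call time). The returned qMax is the
# agent's Q-value estimate at (s=0, a=0); its gap to the true value
# reflects the width of the algorithm's confidence set after nObs episodes:
#
#   qMax = outputConfidenceKnownR('PSRL', nextStateMul=2, nObs=1000)
#   print(qMax)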
def outputConfidenceH(alg, epLen, nObs):
    '''
    Output the confidence set for a given algorithm when epLen changes.

    Args:
        alg - string for which algorithm to use
        epLen - length of the episode horizon
        nObs - how many observations to collect

    Returns:
        qMax - the 0.05 upper quantile
    '''
    # Make the environment
    env = environment.make_HconfidenceMDP(epLen)

    # Make the feature extractor
    f_ext = FeatureTrueState(env.epLen, env.nState, env.nAction, env.nState)

    # Make the agent
    agent_constructor = alg_dict[alg]
    agent = agent_constructor(env.nState, env.nAction, env.epLen)

    for ep in xrange(nObs):
        # Reset the environment
        env.reset()
        agent.update_policy(ep)

        pContinue = 1
        while pContinue > 0:
            # Step through the episode
            h, oldState = f_ext.get_feat(env)
            action = 0
            reward, newState, pContinue = env.advance(action)
            agent.update_obs(oldState, action, reward, newState,
                             pContinue, h)

    agent.update_policy()
    return agent.qVals[0, 0][0]
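# Usage sketch (hypothetical values, not part of the original script):
# sweeping the horizon shows how an algorithm's confidence bound scales
# with epLen; the horizons and algorithm chosen here are illustrative only:
#
#   for epLen in [2, 5, 10, 20]:
#       qMax = outputConfidenceH('UCRL2', epLen, nObs=1000)
#       print(epLen, qMax)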
            '%03.2f' % args.scaling + '_seed=' + str(args.seed) + '.csv')
folderName = './'
targetPath = folderName + fileName

print('******************************************************************')
print(fileName)
print('******************************************************************')

# Make the environment
env = environment.make_hardBanditMDP(epLen=args.epLen, gap=args.gap,
                                     nAction=2, pSuccess=0.5)

# Make the feature extractor
f_ext = FeatureTrueState(env.epLen, env.nState, env.nAction, env.nState)

# Make the agent
alg_dict = {'PSRL': finite_tabular_agents.PSRL,
            'PSRLunif': finite_tabular_agents.PSRLunif,
            'OptimisticPSRL': finite_tabular_agents.OptimisticPSRL,
            'GaussianPSRL': finite_tabular_agents.GaussianPSRL,
            'UCBVI': finite_tabular_agents.UCBVI,
            'BEB': finite_tabular_agents.BEB,
            'BOLT': finite_tabular_agents.BOLT,
            'UCRL2': finite_tabular_agents.UCRL2,
            'UCRL2_GP': finite_tabular_agents.UCRL2_GP,
            'UCRL2_GP_RTDP': finite_tabular_agents.UCRL2_GP_RTDP,
            'EULER': finite_tabular_agents.EULER,
            'EULER_GP': finite_tabular_agents.EULER_GP,