# Imports assumed by the snippets below; project-internal names
# (_collect_stats, AgentStats, LogFile, NoDaemonProcessPool) are expected to be
# defined elsewhere in the repository.
import multiprocessing
import time
from multiprocessing import Pool

import numpy as np
from scipy.stats import beta
from scipy.stats import beta as sbeta  # alias used by the bandit utilities below


def test_agent(
        env,
        agent,
        num_offline_users = 1000,
        num_online_users = 100,
        num_organic_offline_users = 100,
        num_epochs = 1,
        epoch_with_random_reset = False
):
    # Sequential variant: run every epoch in-process and pool the click statistics.
    successes = 0
    failures = 0

    argss = [
        {
            'env': env,
            'agent': agent,
            'num_offline_users': num_offline_users,
            'num_online_users': num_online_users,
            'num_organic_offline_users': num_organic_offline_users,
            'epoch_with_random_reset': epoch_with_random_reset,
            'epoch': epoch,
        }
        for epoch in range(num_epochs)
    ]

    for result in [_collect_stats(args) for args in argss]:
        successes += result[AgentStats.SUCCESSES]
        failures += result[AgentStats.FAILURES]

    # Posterior median and 95% credible interval for the click-through rate.
    return (
        beta.ppf(0.500, successes + 1, failures + 1),
        beta.ppf(0.025, successes + 1, failures + 1),
        beta.ppf(0.975, successes + 1, failures + 1)
    )
def test_agent(env, agent, num_offline_users=1000, num_online_users=100,
               num_organic_offline_users=100, num_epochs=1, log_file=None):
    # open optional logging
    log = LogFile(log_file)

    # initialize user id to 1 for logging purposes
    user_id = 1

    # Offline organic Training -------------------------------------------------------
    print("Starting Agent Training")
    for i in range(num_epochs):
        env.__init__()  # Reset the env for repeated sequences
        for u in range(num_organic_offline_users):
            env.reset()
            observation, _, _, _ = env.step(None)
            agent.train(observation, None, None, True)

    # Offline Training -------------------------------------------------------
    for i in range(num_epochs):
        env.__init__()  # Reset the env for repeated sequences
        for u in range(num_offline_users):
            env.reset()
            observation, _, done, _ = env.step(None)
            while not done:
                old_observation = observation
                action, observation, reward, done, info = env.step_offline()
                agent.train(old_observation, action, reward, done)
                # Only log events from the final epoch.
                if i == (num_epochs - 1):
                    log.write(user_id, False, observation, action, reward)
            if i == (num_epochs - 1):
                user_id += 1

    # Online Testing ---------------------------------------------------------
    suc = 0
    fail = 0
    print("Starting Agent Testing")
    for _ in range(num_online_users):
        env.reset()
        observation, _, done, _ = env.step(None)
        reward = None
        done = None
        while not done:
            action = agent.act(observation, reward, done)
            observation, reward, done, info = env.step(action)
            user_id += 1
            # Count a success for every clicked recommendation.
            if reward:
                suc = suc + 1
            else:
                fail = fail + 1

    # Posterior median and 95% credible interval for the click-through rate.
    return (
        beta.ppf(0.5, suc + 1, fail + 1),
        beta.ppf(0.025, suc + 1, fail + 1),
        beta.ppf(0.975, suc + 1, fail + 1)
    )
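# Both test_agent variants summarise the observed clicks with the same Bayesian
# estimate: a Beta(1, 1) prior updated by the successes/failures, reported as the
# posterior median plus a 95% credible interval. A minimal sketch of that
# computation (ctr_credible_interval is a hypothetical helper, not part of the
# original code):
def ctr_credible_interval(successes, failures):
    # Posterior over the click-through rate under a uniform prior.
    posterior_a = successes + 1
    posterior_b = failures + 1
    return (
        beta.ppf(0.500, posterior_a, posterior_b),  # posterior median
        beta.ppf(0.025, posterior_a, posterior_b),  # lower bound of 95% interval
        beta.ppf(0.975, posterior_a, posterior_b),  # upper bound of 95% interval
    )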
def train_eval_online(env, num_users, agent, mode='train'):
    """
    Trains or evaluates the agent in the environment by sampling a given number of users

    :param env: recommendation environment
    :param num_users: number of users to sample from environment
    :param agent: agent
    :param mode: train or eval
    :return: tuple of agent class, 50% quantile of CTR, 2.5% quantile of CTR,
        97.5% quantile of CTR, execution time
    """
    num_clicks, num_displays = 0, 0

    start = time.time()
    for _ in range(num_users):
        env.reset()
        observation, _, _, _ = env.step(None)
        reward, done = None, False
        while not done:
            # choose action
            action = agent.act(observation)

            # execute action in the environment
            next_observation, reward, done, info = env.step(action['a'])

            # train on the feedback
            if mode == 'train':
                agent.train(observation, action['a'], reward, done)

            # compute click through rate
            num_clicks += 1 if reward == 1 and reward is not None else 0
            num_displays += 1 if reward == 0 and reward is not None else 0

            # update observation
            observation = next_observation
    end = time.time()

    result = (type(agent).__name__,
              beta.ppf(0.500, num_clicks + 1, num_displays + 1),
              beta.ppf(0.025, num_clicks + 1, num_displays + 1),
              beta.ppf(0.975, num_clicks + 1, num_displays + 1),
              end - start)
    return result
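# The loop above only relies on a small interface: env.reset(), env.step()
# returning (observation, reward, done, info), and an agent with act()/train().
# A self-contained sketch of that interface with toy stand-ins (StubEnv and
# RandomAgent are illustrative assumptions, not classes from the original project):
class StubEnv:
    """Toy environment: each user receives 5 recommendations and clicks at random."""

    def __init__(self, num_products=10, steps_per_user=5, click_prob=0.2, seed=0):
        self.num_products = num_products
        self.steps_per_user = steps_per_user
        self.click_prob = click_prob
        self.rng = np.random.RandomState(seed)
        self.t = 0

    def reset(self):
        self.t = 0

    def step(self, action):
        if action is None:
            # First call of a user session: organic observation, no reward.
            return self.rng.randint(self.num_products), None, False, {}
        self.t += 1
        reward = int(self.rng.rand() < self.click_prob)
        done = self.t >= self.steps_per_user
        return self.rng.randint(self.num_products), reward, done, {}


class RandomAgent:
    """Recommends a uniformly random product and ignores all feedback."""

    def __init__(self, num_products=10, seed=0):
        self.num_products = num_products
        self.rng = np.random.RandomState(seed)

    def act(self, observation):
        return {'a': self.rng.randint(self.num_products)}

    def train(self, observation, action, reward, done):
        pass


# Example: train_eval_online(StubEnv(), 100, RandomAgent(), mode='eval')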
def test_agent(
        env,
        agent,
        num_offline_users = 1000,
        num_online_users = 100,
        num_organic_offline_users = 100,
        num_epochs = 1,
        epoch_with_random_reset = False
):
    successes = 0
    failures = 0

    with Pool(processes = multiprocessing.cpu_count()) as pool:
    # with NoDaemonProcessPool(processes = multiprocessing.cpu_count()) as pool:
        argss = [
            {
                'env': env,
                'agent': agent,
                'num_offline_users': num_offline_users,
                'num_online_users': num_online_users,
                'num_organic_offline_users': num_organic_offline_users,
                'epoch_with_random_reset': epoch_with_random_reset,
                'epoch': epoch,
            }
            for epoch in range(num_epochs)
        ]

        for result in (
                [_collect_stats(args) for args in argss]
                if num_epochs == 1 else
                pool.map(_collect_stats, argss)
        ):
            successes += result[AgentStats.SUCCESSES]
            failures += result[AgentStats.FAILURES]

    return (
        beta.ppf(0.500, successes + 1, failures + 1),
        beta.ppf(0.025, successes + 1, failures + 1),
        beta.ppf(0.975, successes + 1, failures + 1)
    )
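# Hypothetical usage of the pooled test_agent above (the construction of env and
# agent is assumed to come from the surrounding project; the names below are
# placeholders only):
#
#     median_ctr, lower_ctr, upper_ctr = test_agent(env, agent, num_epochs=4)
#
# With num_epochs == 1 the single epoch runs in-process; otherwise the per-epoch
# argument dictionaries are fanned out to the worker pool via pool.map, and the
# click statistics are pooled before the Beta quantiles are computed.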
def getPR(b):
    NBANDITS = b.nBandits
    MAX_CHOICESB = 101

    # INIT: per-bandit success counts (wins) and pull counts (puls)
    wins = np.zeros(NBANDITS)
    puls = np.zeros(NBANDITS)
    for bandit, i in zip(b.bandits, range(NBANDITS)):
        wins[i] = sum(bandit.payoffs)
        puls[i] = len(bandit.payoffs)

    # CHOOSE: Beta(1 + wins, 1 + losses) posterior parameters for each bandit
    apar = np.array([1] * NBANDITS) + wins
    bpar = np.array([1] * NBANDITS) + (puls - wins)

    # Evaluate each posterior on a grid of 100 quantile levels
    xpoints = np.zeros(shape=(MAX_CHOICESB - 1, NBANDITS))
    probability = np.arange(0 + (1 / ((MAX_CHOICESB - 1) * 1.0)),
                            1 + (1 / ((MAX_CHOICESB - 1) * 1.0)),
                            1 / ((MAX_CHOICESB - 1) * 1.0))
    for i in range(NBANDITS):
        xpoints[:, i] = sbeta.ppf(probability, apar[i], bpar[i])

    # At each quantile level, find the bandit with the largest quantile,
    # then turn the counts into selection probabilities
    idx = np.argmax(xpoints, axis=1)
    unique, counts = np.unique(idx, return_counts=True)
    pro = np.zeros(NBANDITS)
    pro[unique] = counts
    probs = pro / ((1.0) * (MAX_CHOICESB - 1))

    # UPDATE AUTOMATICALLY: store each bandit's selection probability
    for bandit, i in zip(b.bandits, range(NBANDITS)):
        bandit.PR = probs[i]
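# getPR estimates, for each bandit, how often its posterior quantile is the
# largest across a fixed grid of 100 levels, a cheap stand-in for the probability
# that the bandit has the highest click rate. A hypothetical Monte Carlo estimate
# of that probability under independent posterior draws, for comparison
# (thompson_choice_probs is illustrative, not part of the original code):
def thompson_choice_probs(wins, pulls, num_samples=10000, rng=None):
    rng = np.random.RandomState(0) if rng is None else rng
    wins = np.asarray(wins, dtype=float)
    pulls = np.asarray(pulls, dtype=float)
    # One Beta(1 + wins, 1 + losses) draw per bandit per sample.
    draws = rng.beta(1.0 + wins, 1.0 + (pulls - wins),
                     size=(num_samples, len(wins)))
    # Fraction of samples in which each bandit had the largest draw.
    best = np.argmax(draws, axis=1)
    counts = np.bincount(best, minlength=len(wins))
    return counts / float(num_samples)

# Example: thompson_choice_probs([3, 1], [10, 10]) puts most mass on the first bandit.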
def rbeta(alpha, beta, size=None):
    """
    Random beta variates.
    """
    return sbeta.ppf(np.random.random(size), alpha, beta)
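# rbeta draws Beta variates via inverse-CDF sampling: uniform random numbers are
# pushed through the Beta percent-point function (ppf). np.random.beta(alpha, beta, size)
# samples from the same distribution directly. Illustrative use (the counts are
# made up): one posterior draw per bandit,
#     rbeta(1 + np.array([3., 1.]), 1 + np.array([7., 9.]), size=2)
# and taking np.argmax of the result is a single Thompson-sampling decision.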