Example No. 1
from scipy.stats import beta  # posterior quantiles for the CTR credible interval

# _collect_stats and AgentStats are assumed to be helpers defined elsewhere in
# the same module; _collect_stats runs one epoch and counts successes/failures.
def test_agent(
        env,
        agent,
        num_offline_users=1000,
        num_online_users=100,
        num_organic_offline_users=100,
        num_epochs=1,
        epoch_with_random_reset=False
):
    successes = 0
    failures = 0

    argss = [
        {
            'env': env,
            'agent': agent,
            'num_offline_users': num_offline_users,
            'num_online_users': num_online_users,
            'num_organic_offline_users': num_organic_offline_users,
            'epoch_with_random_reset': epoch_with_random_reset,
            'epoch': epoch,
        }
        for epoch in range(num_epochs)
    ]

    for result in [_collect_stats(args) for args in argss]:
        successes += result[AgentStats.SUCCESSES]
        failures += result[AgentStats.FAILURES]

    return (
        beta.ppf(0.500, successes + 1, failures + 1),
        beta.ppf(0.025, successes + 1, failures + 1),
        beta.ppf(0.975, successes + 1, failures + 1)
    )
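The three returned values are quantiles of a Beta(successes + 1, failures + 1) posterior, i.e. the median and an equal-tailed 95% credible interval for the click-through rate under a uniform prior. A minimal, self-contained illustration of that interpretation (the counts are made up):

from scipy.stats import beta

successes, failures = 30, 100  # made-up counts, purely for illustration

median = beta.ppf(0.500, successes + 1, failures + 1)  # posterior median of the rate
lower = beta.ppf(0.025, successes + 1, failures + 1)   # lower end of the 95% interval
upper = beta.ppf(0.975, successes + 1, failures + 1)   # upper end of the 95% interval
print(f"rate ~ {median:.3f}, 95% CI [{lower:.3f}, {upper:.3f}]")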
Example No. 2
from scipy.stats import beta  # posterior quantiles for the CTR credible interval

# LogFile is assumed to be an external logging helper; the code below only
# relies on LogFile(path) construction and log.write(...).
def test_agent(env, agent, num_offline_users=1000, num_online_users=100,
               num_organic_offline_users=100, num_epochs=1, log_file=None):

    # open optional logging
    log = LogFile(log_file)

    # initialize user id to 1 for logging purposes
    user_id = 1

    # Offline Organic Training ------------------------------------------------
    print("Starting Agent Training")
    for i in range(num_epochs):
        env.__init__()  # Reset the env for repeated sequences
        for u in range(num_organic_offline_users):
            env.reset()
            observation, _, _, _ = env.step(None)
            agent.train(observation, None, None, True)


    # Offline Training -------------------------------------------------------
    for i in range(num_epochs):
        env.__init__()  # Reset the env for repeated sequences
        for u in range(num_offline_users):
            env.reset()
            observation, _, done, _ = env.step(None)
            while not done:
                old_observation = observation
                action, observation, reward, done, info = env.step_offline()
                agent.train(old_observation, action, reward, done)
                if i == (num_epochs-1):
                    log.write(user_id, False, observation, action, reward)
            if i == (num_epochs-1):
                user_id += 1

    # Online Testing ---------------------------------------------------------
    suc = 0
    fail = 0
    print("Starting Agent Testing")
    for _ in range(num_online_users):
        env.reset()
        observation, _, done, _ = env.step(None)
        reward = None  # no reward yet for the agent's first act() call
        while not done:
            action = agent.act(observation, reward, done)
            observation, reward, done, info = env.step(action)

            user_id += 1
            if reward:
                suc = suc + 1
            else:
                fail = fail + 1

    return (
        beta.ppf(0.5, suc+1, fail+1),
        beta.ppf(0.025, suc+1, fail+1),
        beta.ppf(0.975, suc+1, fail+1)
        )
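The LogFile helper is external to this snippet. The only interface the code above relies on is the one sketched in the comment below; everything else about it is an assumption:

# Assumed interface of the external LogFile helper (not shown here):
#   log = LogFile(path_or_None)
#   log.write(user_id, online_flag, observation, action, reward)
# When log_file is None the helper presumably discards the records.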
Example No. 3
import time

from scipy.stats import beta  # posterior quantiles for the CTR credible interval


def train_eval_online(env, num_users, agent, mode='train'):
    """
    Trains or evaluates the agent in the environment by sampling a given number of users.

    :param env: recommendation environment
    :param num_users: number of users to sample from the environment
    :param agent: agent
    :param mode: 'train' or 'eval'
    :return: tuple of (agent class name, 50% quantile of CTR, 2.5% quantile of CTR,
             97.5% quantile of CTR, execution time in seconds)
    """
    num_clicks, num_displays = 0, 0

    start = time.time()
    for _ in range(num_users):
        env.reset()
        observation, _, _, _ = env.step(None)
        reward, done = None, False
        while not done:
            # choose action
            action = agent.act(observation)
            # execute action in the environment
            next_observation, reward, done, info = env.step(action['a'])
            # train on the feedback
            if mode == 'train':
                agent.train(observation, action['a'], reward, done)
            # update click-through statistics (a reward of 1 counts as a click, 0 as a non-clicked display)
            if reward == 1:
                num_clicks += 1
            elif reward == 0:
                num_displays += 1
            # update observation
            observation = next_observation

    end = time.time()
    result = (type(agent).__name__,
              beta.ppf(0.500, num_clicks + 1, num_displays + 1),
              beta.ppf(0.025, num_clicks + 1, num_displays + 1),
              beta.ppf(0.975, num_clicks + 1, num_displays + 1), end - start)
    return result
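A hypothetical usage sketch: env and my_agent are placeholders for a real recommendation environment (gym-style reset/step) and an agent whose act() returns a dict with key 'a', matching the interface used above; neither is defined here.

# Placeholders: env and my_agent are not defined in this snippet.
name, ctr_median, ctr_low, ctr_high, elapsed = train_eval_online(env, 1000, my_agent, mode='train')
name, ctr_median, ctr_low, ctr_high, elapsed = train_eval_online(env, 200, my_agent, mode='eval')
print(f"{name}: CTR {ctr_median:.4f} (95% CI [{ctr_low:.4f}, {ctr_high:.4f}]), {elapsed:.1f}s")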
Example No. 4
import multiprocessing
from multiprocessing import Pool

from scipy.stats import beta  # posterior quantiles for the CTR credible interval

# As in Example No. 1, _collect_stats and AgentStats are assumed to be helpers
# defined elsewhere in the same module.
def test_agent(
        env,
        agent,
        num_offline_users=1000,
        num_online_users=100,
        num_organic_offline_users=100,
        num_epochs=1,
        epoch_with_random_reset=False
):
    successes = 0
    failures = 0

    # with NoDaemonProcessPool(processes=multiprocessing.cpu_count()) as pool:
    with Pool(processes=multiprocessing.cpu_count()) as pool:
        argss = [
            {
                'env': env,
                'agent': agent,
                'num_offline_users': num_offline_users,
                'num_online_users': num_online_users,
                'num_organic_offline_users': num_organic_offline_users,
                'epoch_with_random_reset': epoch_with_random_reset,
                'epoch': epoch,
            }
            for epoch in range(num_epochs)
        ]

        # Run the single epoch serially; fan out over the process pool otherwise.
        if num_epochs == 1:
            results = [_collect_stats(args) for args in argss]
        else:
            results = pool.map(_collect_stats, argss)

        for result in results:
            successes += result[AgentStats.SUCCESSES]
            failures += result[AgentStats.FAILURES]

    return (
        beta.ppf(0.500, successes + 1, failures + 1),
        beta.ppf(0.025, successes + 1, failures + 1),
        beta.ppf(0.975, successes + 1, failures + 1)
    )
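Because this variant creates a multiprocessing Pool, the call typically has to sit under an import guard on platforms that spawn worker processes (e.g. Windows). A hypothetical call, with env and agent as undefined placeholders:

if __name__ == '__main__':
    # Placeholders: env and agent are not defined in this snippet.
    median_ctr, lower_ctr, upper_ctr = test_agent(env, agent, num_epochs=4)
    print(median_ctr, lower_ctr, upper_ctr)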
Example No. 5
import numpy as np
from scipy.stats import beta as sbeta


def getPR(b):
    """Update each bandit's PR attribute: the fraction of quantile levels at which
    its Beta posterior quantile is the largest among all arms (a quantile-grid
    approximation of the probability that the arm is the best one)."""
    NBANDITS = b.nBandits
    MAX_CHOICESB = 101

    # INIT: per-arm success and pull counts
    wins = np.zeros(NBANDITS)
    puls = np.zeros(NBANDITS)
    for i, bandit in enumerate(b.bandits):
        wins[i] = sum(bandit.payoffs)
        puls[i] = len(bandit.payoffs)

    # CHOOSE: Beta posterior parameters under a uniform Beta(1, 1) prior
    apar = wins + 1
    bpar = (puls - wins) + 1

    # Grid of quantile levels (0.01, 0.02, ..., 1.00) and the corresponding
    # posterior quantiles for every arm
    probability = np.linspace(1.0 / (MAX_CHOICESB - 1), 1.0, MAX_CHOICESB - 1)
    xpoints = np.zeros(shape=(MAX_CHOICESB - 1, NBANDITS))
    for i in range(NBANDITS):
        xpoints[:, i] = sbeta.ppf(probability, apar[i], bpar[i])

    # For each quantile level, find the arm with the largest posterior quantile
    idx = np.argmax(xpoints, axis=1)

    # Fraction of grid points at which each arm comes out on top
    unique, counts = np.unique(idx, return_counts=True)
    pro = np.zeros(NBANDITS)
    pro[unique] = counts
    probs = pro / (MAX_CHOICESB - 1.0)

    # UPDATE AUTOMATICALLY: write the probabilities back onto the bandits
    for i, bandit in enumerate(b.bandits):
        bandit.PR = probs[i]
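The same quantile-grid idea in a self-contained form, for two arms with made-up win/pull counts; the resulting probabilities sum to one and play the role of bandit.PR above:

import numpy as np
from scipy.stats import beta as sbeta

wins = np.array([12.0, 3.0])   # hypothetical successes per arm
puls = np.array([40.0, 10.0])  # hypothetical pulls per arm

grid = np.linspace(0.01, 1.0, 100)  # quantile levels
quantiles = np.stack(
    [sbeta.ppf(grid, w + 1, p - w + 1) for w, p in zip(wins, puls)], axis=1)
best = np.argmax(quantiles, axis=1)             # arm with the largest quantile at each level
probs = np.bincount(best, minlength=2) / 100.0  # fraction of levels each arm wins
print(probs)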
Example No. 6
import numpy as np
from scipy.stats import beta as sbeta


def rbeta(alpha, beta, size=None):
    """
    Random beta variates via inverse-transform sampling (uniform draws through the Beta ppf).
    """
    return sbeta.ppf(np.random.random(size), alpha, beta)
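Since this is inverse-transform sampling (uniform draws pushed through the Beta inverse CDF), the output should match numpy's built-in Beta sampler; a quick sanity check:

import numpy as np

samples = rbeta(2.0, 5.0, size=100_000)
reference = np.random.beta(2.0, 5.0, size=100_000)
print(samples.mean(), reference.mean())  # both close to 2 / (2 + 5)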