Example #1
 def policy_evaluation(self, policy, context, desired_action, delta, r,
                       epsilon):
     if policy != 'LinThompSamp':
         print("We don't support other bandit algorithms now!")
     else:
         historystorage = history.MemoryHistoryStorage()
         modelstorage = model.MemoryModelStorage()
         policy = linthompsamp.LinThompSamp(self.actions, historystorage,
                                            modelstorage, self.d, delta, r,
                                            epsilon)
         seq_error = np.zeros(shape=(self.t, 1))
         for t in range(self.t):
             history_id, action = policy.get_action(context[t])
             if desired_action[t][0] != action:
                 policy.reward(history_id, 0)
                 # sum_error += 1
                 if t == 0:
                     seq_error[t] = 1.0
                 else:
                     seq_error[t] = seq_error[t - 1] + 1.0
             else:
                 policy.reward(history_id, 1)
                 if t > 0:
                     seq_error[t] = seq_error[t - 1]
         return seq_error
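These snippets come from tests and examples for the striatum contextual-bandit library, and the listing omits the imports each of them relies on. A plausible preamble, assuming striatum's documented module layout (an assumption, not part of the original code):

import numpy as np
from striatum.storage import history, model, action
from striatum.bandit import exp3, exp4p, linthompsamp, linucb, ucb1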
Example #2
 def policy_evaluation(self, policy, context, desired_action, models,
                       delta):
     if policy != 'EXP4P':
         print("We don't support other bandit algorithms now!")
     else:
         historystorage = history.MemoryHistoryStorage()
         modelstorage = model.MemoryModelStorage()
         seq_error = np.zeros(shape=(self.t, 1))
         policy = exp4p.Exp4P(self.actions,
                              historystorage,
                              modelstorage,
                              models,
                              delta,
                              pmin=None)
         for t in range(self.t):
             history_id, action = policy.get_action(context[t])
             if desired_action[t][0] != action:
                 policy.reward(history_id, 0)
                 # sum_error += 1
                 if t == 0:
                     seq_error[t] = 1.0
                 else:
                     seq_error[t] = seq_error[t - 1] + 1.0
             else:
                 policy.reward(history_id, 1)
                 if t > 0:
                     seq_error[t] = seq_error[t - 1]
         return seq_error
Example #3
def policy_generation(bandit, actions):
    historystorage = history.MemoryHistoryStorage()
    modelstorage = model.MemoryModelStorage()

    if bandit == 'Exp4P':
        policy = exp4p.Exp4P(actions,
                             historystorage,
                             modelstorage,
                             delta=0.5,
                             pmin=None)

    elif bandit == 'LinUCB':
        policy = linucb.LinUCB(actions, historystorage, modelstorage, 0.3, 20)

    elif bandit == 'LinThompSamp':
        policy = linthompsamp.LinThompSamp(actions,
                                           historystorage,
                                           modelstorage,
                                           d=20,
                                           delta=0.61,
                                           r=0.01,
                                           epsilon=0.71)

    elif bandit == 'UCB1':
        policy = ucb1.UCB1(actions, historystorage, modelstorage)

    elif bandit == 'Exp3':
        policy = exp3.Exp3(actions, historystorage, modelstorage, gamma=0.2)

    elif bandit == 'random':
        policy = 0

    return policy
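A policy returned by policy_generation is driven with the same get_action/reward loop as the evaluation methods above. A minimal sketch, where context, desired_action and n_trials are illustrative placeholders rather than part of the original example:

def run_policy(policy, context, desired_action, n_trials):
    # Cumulative error count, mirroring policy_evaluation above
    seq_error = np.zeros(shape=(n_trials, 1))
    for t in range(n_trials):
        history_id, action_chosen = policy.get_action(context[t])
        reward = 1 if desired_action[t][0] == action_chosen else 0
        policy.reward(history_id, reward)
        if t == 0:
            seq_error[t] = 1 - reward
        else:
            seq_error[t] = seq_error[t - 1] + (1 - reward)
    return seq_error

Note that the 'random' branch above returns 0 rather than a policy object, so it would have to be special-cased before entering such a loop.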
Example #4
 def setUp(self):
     self.modelstorage = model.MemoryModelStorage()
     self.historystorage = history.MemoryHistoryStorage()
     self.actions = [1, 2, 3]
     self.d = 2
     self.delta = 0.5
     self.R = 0.5
     self.epsilon = 0.1
Example #5
def train_movielens(max_iter=163683, batch_size=100):
    # For details on how the data was preprocessed, see:
    # https://striatum.readthedocs.io/en/latest/auto_examples/index.html#general-examples

    streaming_batch = pd.read_csv('streaming_batch.csv', sep='\t', names=['user_id'], engine='c')
    user_feature = pd.read_csv('user_feature.csv', sep='\t', header=0, index_col=0, engine='c')
    arm_ids = list(pd.read_csv('actions.csv', sep='\t', header=0, engine='c')['movie_id'])
    reward_list = pd.read_csv('reward_list.csv', sep='\t', header=0, engine='c')

    streaming_batch = streaming_batch.iloc[0:max_iter]

    # The n_actions argument below sets the number of actions available at the initial time step
    th = TreeHeuristic(history.MemoryHistoryStorage(), model.MemoryModelStorage(),
                       action.MemoryActionStorage(), n_actions=50)
    actions = make_arm(arm_ids=arm_ids)

    reward_sum = 0
    y = []

    print("Starting Now...")
    start = time.time()

    for i in range(max_iter):
        context = np.array(user_feature[user_feature.index == streaming_batch.iloc[i, 0]])[0]

        if i == 0:
            th.build(first_context=context, actions=actions)

        history_id, recommendations = th.sample_from_beta(context=context)

        watched_list = reward_list[reward_list['user_id'] == streaming_batch.iloc[i, 0]]

        if recommendations.action.id not in list(watched_list['movie_id']):
            # Wrong recommendation: a reward of 0 is received
            th.reward(history_id, {recommendations.action.id: 0.0})
            th.update_D(context=context, action_id=recommendations.action.id, reward=0.0)

        else:
            # Correct recommendation: a reward of 1 is received
            th.reward(history_id, {recommendations.action.id: 1.0})
            th.update_D(context=context, action_id=recommendations.action.id, reward=1.0)
            reward_sum += 1

        if i % batch_size == 0 and i != 0:
            for action_chosen in th._action_storage.iterids():
                th.update_tree(action_id=action_chosen)

        if i % 100 == 0:
            print("Step: {} -- Average Reward: {}".format(i, np.round(reward_sum / (i+1), 4)))

        y.append(reward_sum / (i + 1))

    print("Time: {}".format(time.time() - start))
    x = list(range(max_iter))
    plt.figure()
    plt.plot(x, y, c='r')
    plt.title("Cumulative Average Reward of \n Tree Heuristic: Movie Lens Data")
    plt.show()
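train_movielens calls a make_arm helper that is not shown in this listing. Assuming it merely wraps the raw movie ids into striatum Action objects (an assumption; the real helper may do more), it could look like this:

from striatum.storage import Action

def make_arm(arm_ids):
    # Assumed behaviour: one Action object per raw arm id
    return [Action(arm_id) for arm_id in arm_ids]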
Example #6
def train_covtype(n_samples=581000, batch_size=3000):
    file = pd.read_csv("covtype.data", header=None)
    data = file.values
    np.random.shuffle(data)

    X, temp = data[:, 0:54], data[:, 54]
    Y = pd.get_dummies(temp).values

    actions = make_arm(list(range(7)))

    th = TreeHeuristic(history.MemoryHistoryStorage(), model.MemoryModelStorage(),
                       action.MemoryActionStorage(), n_actions=7)

    th.build(first_context=X[0], actions=actions)

    reward_sum = 0
    y = []

    print("Starting Now...")
    start = time.time()

    for i in range(n_samples):

        context = X[i]
        history_id, recommendations = th.sample_from_beta(context=context)

        # Observe the actual reward and accumulate it
        actual_reward = Y[i, recommendations.action.id]
        reward_sum += actual_reward

        th.reward(history_id, {recommendations.action.id: actual_reward})

        # D has to be updated on every trial
        th.update_D(context=context, action_id=recommendations.action.id, reward=actual_reward)

        # Refit the trees once a full batch has been collected
        if i % batch_size == 0 and i != 0:
            for action_chosen in th._action_storage.iterids():
                th.update_tree(action_id=action_chosen)

        # Log progress every 100 steps
        if i % 100 == 0:
            print("Step: {} -- Average Reward: {}".format(i, np.round(reward_sum / (i+1), 4)))

        y.append(reward_sum/(i+1))

    print("Time: {}".format(time.time() - start))
    x = list(range(n_samples))
    y[0] = 0
    plt.figure()
    plt.plot(x, y, c='r')
    plt.title("Cumulative Average Reward Flow of \n Tree Heuristic: Cover type Data")
    plt.show()
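Both training functions stream the data once and plot the running average reward. For a quick smoke test they can be called with smaller arguments than the defaults, for example:

if __name__ == '__main__':
    # Subsampled runs; the defaults iterate over the full datasets
    train_covtype(n_samples=10000, batch_size=500)
    train_movielens(max_iter=5000, batch_size=100)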
Example #7
 def setUp(self):
     self.modelstorage = model.MemoryModelStorage()
     self.historystorage = history.MemoryHistoryStorage()
     self.actions = [1, 2, 3, 4, 5]
     self.history_context = np.random.uniform(0, 5, (1000, 2))
     self.history_action = np.zeros(1000)
     for t in range(1000):
         for i in range(5):
             if i * 2 < sum(self.history_context[t, :]) <= (i + 1) * 2:
                 self.history_action[t] = self.actions[i]
     self.LogReg = OneVsRestClassifier(LogisticRegression())
     self.MNB = OneVsRestClassifier(MultinomialNB())
     self.LogReg.fit(self.history_context, self.history_action)
     self.MNB.fit(self.history_context, self.history_action)
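This fixture fits two one-vs-rest classifiers on synthetic history; in the EXP4P evaluation of Example #2, pre-trained experts like these are what gets passed as the models argument. A hedged sketch of how the fixture's pieces would be combined inside a test method (the delta value is illustrative):

models = [self.LogReg, self.MNB]  # pre-trained expert-advice models
policy = exp4p.Exp4P(self.actions, self.historystorage, self.modelstorage,
                     models, delta=0.1, pmin=None)

From there the get_action/reward loop of Example #2 applies unchanged.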
Example #8
def policy_generation(bandit, actions):
    """
    Parameters
    ----------
    bandit: the bandit algorithm to use
    actions: the actions, i.e. the movies to recommend

    Returns
    -------
    policy: 生成的策略
    """
    historystorage = history.MemoryHistoryStorage()  # in-memory history storage
    modelstorage = model.MemoryModelStorage()  # in-memory model storage, kept for consistency

    if bandit == 'Exp4P':
        policy = exp4p.Exp4P(historystorage,
                             modelstorage,
                             actions,
                             delta=0.5,
                             p_min=None)

    elif bandit == 'LinUCB':
        #policy = linucb.LinUCB(historystorage, modelstorage, actions, 0.3, 20)
        policy = linucb.LinUCB(history_storage=historystorage,
                               model_storage=modelstorage,
                               action_storage=actions,
                               alpha=0.3,
                               context_dimension=18)

    elif bandit == 'LinThompSamp':
        policy = linthompsamp.LinThompSamp(
            historystorage,
            modelstorage,
            actions,
            #d=20, Supposed to be context dimension
            context_dimension=18,
            delta=0.61,
            R=0.01,
            epsilon=0.71)

    elif bandit == 'UCB1':
        policy = ucb1.UCB1(historystorage, modelstorage, actions)

    elif bandit == 'Exp3':
        policy = exp3.Exp3(historystorage, modelstorage, actions, gamma=0.2)

    elif bandit == 'random':
        policy = 0

    return policy
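In this newer keyword-argument API, actions is an action storage rather than a plain list, and get_action takes a per-action context dictionary (the learn_policy example further below builds its action ids from context.keys()). A minimal wiring sketch; the movie ids, the 18-dimensional random contexts, and the Action/MemoryActionStorage.add calls are assumptions based on the surrounding examples rather than code from this listing:

action_storage = action.MemoryActionStorage()
action_storage.add([action.Action(movie_id) for movie_id in (1, 2, 3)])
policy = policy_generation('LinUCB', action_storage)

# context maps each action id to its feature vector
context = {movie_id: np.random.uniform(0, 1, 18) for movie_id in (1, 2, 3)}
history_id, recommendations = policy.get_action(context, 1)
policy.reward(history_id, {recommendations[0].action.id: 1.0})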
Example #9
 def policy_evaluation(self, policy, desired_action):
     if policy != 'UCB1':
         print("We don't support other bandit algorithms now!")
     else:
         historystorage = history.MemoryHistoryStorage()
         modelstorage = model.MemoryModelStorage()
         sum_error = 0
         policy = ucb1.UCB1(self.actions, historystorage, modelstorage)
         for t in range(self.t):
             history_id, action = policy.get_action(context=None)
             if desired_action[t][0] != action:
                 policy.reward(history_id, 0)
                 sum_error += 1
             else:
                 policy.reward(history_id, 1)
         return self.t - sum_error
Example #10
 def learn_policy(self, data_parser):
     np.random.seed(self.seed)
     while True:
         try:
             _, context, rewards = next(data_parser)
             # Synchronise the policy's action set with the actions available in this record
             curr_action_ids = set(context.keys())
             if self._action_storage.count():
                 prev_action_ids = set(self._action_storage.iterids())
                 new_action_ids = curr_action_ids.difference(
                     prev_action_ids)
                 old_action_ids = prev_action_ids.difference(
                     curr_action_ids)
                 if new_action_ids:
                     curr_actions = self._make_action(new_action_ids)
                     self.policy.add_action(curr_actions)
                 if old_action_ids:
                     for old_action_id in old_action_ids:
                         self.policy.remove_action(old_action_id)
             else:
                 curr_actions = self._make_action(curr_action_ids)
                 self.policy.add_action(curr_actions)
             # Replay evaluation: count the logged reward only when the
             # recommended action matches the observed best action
             obs_action_id = max(rewards, key=rewards.get)
             history_id, recommendations = self.policy.get_action(
                 context, 1)
             if obs_action_id == recommendations[0].action.id:
                 if (self.data_size is None) or (np.random.uniform() <=
                                                 self.data_size):
                     self.policy.reward(history_id, rewards)
                     self.reward_values = np.append(
                         self.reward_values,
                         np.array(rewards[obs_action_id]))
             self._n_iter += 1
             # Periodically drop the accumulated history to bound memory usage
             if (self.reset_freq
                     is not None) and (self._n_iter % self.reset_freq == 0):
                 del self.policy._history_storage
                 _ = gc.collect()
                 self.policy._history_storage = history.MemoryHistoryStorage(
                 )
             if (self.max_iter
                     is not None) and (self._n_iter >= self.max_iter):
                 break
         except StopIteration:
             break
Example #11
 def __init__(self, policy, **params):
     self.seed = None
     self.data_size = None
     self.max_iter = None
     self.reset_freq = None
     if 'seed' in params:
         self.seed = params.pop('seed')
     if 'data_size' in params:
         self.data_size = params.pop('data_size')
     if 'max_iter' in params:
         self.max_iter = params.pop('max_iter')
     if 'reset_freq' in params:
         self.reset_freq = params.pop('reset_freq')
     self._params = params
     self._history_storage = history.MemoryHistoryStorage()
     self._model_storage = model.MemoryModelStorage()
     self._action_storage = action.MemoryActionStorage()
     self.policy = policy(self._history_storage, self._model_storage,
                          self._action_storage, **self._params)
     self.reward_values = np.array([])
     self._n_iter = 0
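The listing shows learn_policy and __init__ of a replay-style evaluation wrapper, but not the class statement or its _make_action helper. Assuming a class named, say, PolicyLearner built around these methods (the class name, the parsed_records variable, and the parameter values are purely illustrative), it would be driven roughly like this:

# Hypothetical usage; 'PolicyLearner' and 'parsed_records' are not part of the listing
learner = PolicyLearner(linucb.LinUCB,
                        context_dimension=18, alpha=0.3,
                        seed=42, max_iter=100000, reset_freq=10000)
# the parser is expected to yield (timestamp, context_dict, rewards_dict) tuples
learner.learn_policy(data_parser=iter(parsed_records))
print(learner.reward_values.mean())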
Example #12
def policy_generation(bandit, actions):
    historystorage = history.MemoryHistoryStorage()
    modelstorage = model.MemoryModelStorage()

    if bandit == 'Exp4P':
        policy = exp4p.Exp4P(historystorage,
                             modelstorage,
                             actions,
                             delta=0.5,
                             p_min=None)

    elif bandit == 'LinUCB':
        #policy = linucb.LinUCB(historystorage, modelstorage, actions, 0.3, 20)
        policy = linucb.LinUCB(history_storage=historystorage,
                               model_storage=modelstorage,
                               action_storage=actions,
                               alpha=0.3,
                               context_dimension=18)

    elif bandit == 'LinThompSamp':
        policy = linthompsamp.LinThompSamp(
            historystorage,
            modelstorage,
            actions,
            #d=20, Supposed to be context dimension
            context_dimension=18,
            delta=0.61,
            R=0.01,
            epsilon=0.71)

    elif bandit == 'UCB1':
        policy = ucb1.UCB1(historystorage, modelstorage, actions)

    elif bandit == 'Exp3':
        policy = exp3.Exp3(historystorage, modelstorage, actions, gamma=0.2)

    elif bandit == 'random':
        policy = 0

    return policy
Example #13
 def setUp(self):
     self.modelstorage = model.MemoryModelStorage()
     self.historystorage = history.MemoryHistoryStorage()
     self.actions = [1, 2, 3]
     self.alpha = 1.00
Example #14
 def setUp(self):
     self.modelstorage = model.MemoryModelStorage()
     self.historystorage = history.MemoryHistoryStorage()
     self.actions = [1, 2, 3, 4, 5]
     self.gamma = 0.5