def policy_evaluation(self, policy, context, desired_action, delta, r, epsilon):
    if policy != 'LinThompSamp':
        print("We don't support other bandit algorithms now!")
        return None
    historystorage = history.MemoryHistoryStorage()
    modelstorage = model.MemoryModelStorage()
    policy = linthompsamp.LinThompSamp(self.actions, historystorage,
                                       modelstorage, self.d,
                                       delta, r, epsilon)
    seq_error = np.zeros(shape=(self.t, 1))
    for t in range(self.t):
        history_id, action = policy.get_action(context[t])
        if desired_action[t][0] != action:
            # Wrong recommendation: zero reward, cumulative error grows by one.
            policy.reward(history_id, 0)
            if t == 0:
                seq_error[t] = 1.0
            else:
                seq_error[t] = seq_error[t - 1] + 1.0
        else:
            # Correct recommendation: unit reward, cumulative error carries over.
            policy.reward(history_id, 1)
            if t > 0:
                seq_error[t] = seq_error[t - 1]
    return seq_error
def policy_evaluation(self, policy, context, desired_action, models, delta):
    if policy != 'EXP4P':
        print("We don't support other bandit algorithms now!")
        return None
    historystorage = history.MemoryHistoryStorage()
    modelstorage = model.MemoryModelStorage()
    seq_error = np.zeros(shape=(self.t, 1))
    policy = exp4p.Exp4P(self.actions, historystorage, modelstorage,
                         models, delta, pmin=None)
    for t in range(self.t):
        history_id, action = policy.get_action(context[t])
        if desired_action[t][0] != action:
            policy.reward(history_id, 0)
            if t == 0:
                seq_error[t] = 1.0
            else:
                seq_error[t] = seq_error[t - 1] + 1.0
        else:
            policy.reward(history_id, 1)
            if t > 0:
                seq_error[t] = seq_error[t - 1]
    return seq_error
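# Illustration only (not in the original source): a minimal sketch of how the
# cumulative-error arrays returned by the policy_evaluation functions above
# could be turned into a regret curve. The per-trial normalization is an
# assumption about how these arrays are meant to be consumed.
import numpy as np
import matplotlib.pyplot as plt

def plot_regret(seq_error, label):
    t = np.arange(1, len(seq_error) + 1)
    # Average regret per trial: cumulative mistakes divided by trials so far.
    plt.plot(t, seq_error.flatten() / t, label=label)
    plt.xlabel('trial')
    plt.ylabel('cumulative regret rate')
    plt.legend()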
def policy_generation(bandit, actions):
    historystorage = history.MemoryHistoryStorage()
    modelstorage = model.MemoryModelStorage()
    if bandit == 'Exp4P':
        policy = exp4p.Exp4P(actions, historystorage, modelstorage,
                             delta=0.5, pmin=None)
    elif bandit == 'LinUCB':
        policy = linucb.LinUCB(actions, historystorage, modelstorage, 0.3, 20)
    elif bandit == 'LinThompSamp':
        policy = linthompsamp.LinThompSamp(actions, historystorage, modelstorage,
                                           d=20, delta=0.61, r=0.01, epsilon=0.71)
    elif bandit == 'UCB1':
        policy = ucb1.UCB1(actions, historystorage, modelstorage)
    elif bandit == 'Exp3':
        policy = exp3.Exp3(actions, historystorage, modelstorage, gamma=0.2)
    elif bandit == 'random':
        policy = 0
    return policy
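# Illustration only (not in the original source): a toy call into
# policy_generation above, using the old striatum call pattern that the
# policy_evaluation functions in this file rely on. The action ids and the
# "arm 3 is optimal" assumption are made up for the example.
actions = [1, 2, 3, 4, 5]
policy = policy_generation('UCB1', actions)
history_id, action = policy.get_action(context=None)  # UCB1 ignores context
policy.reward(history_id, 1 if action == 3 else 0)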
def setUp(self):
    self.modelstorage = model.MemoryModelStorage()
    self.historystorage = history.MemoryHistoryStorage()
    self.actions = [1, 2, 3]
    self.d = 2
    self.delta = 0.5
    self.R = 0.5
    self.epsilon = 0.1
def train_movielens(max_iter=163683, batch_size=100):
    # For details on how the data was preprocessed, see:
    # https://striatum.readthedocs.io/en/latest/auto_examples/index.html#general-examples
    streaming_batch = pd.read_csv('streaming_batch.csv', sep='\t',
                                  names=['user_id'], engine='c')
    user_feature = pd.read_csv('user_feature.csv', sep='\t', header=0,
                               index_col=0, engine='c')
    arm_ids = list(pd.read_csv('actions.csv', sep='\t', header=0,
                               engine='c')['movie_id'])
    reward_list = pd.read_csv('reward_list.csv', sep='\t', header=0, engine='c')
    streaming_batch = streaming_batch.iloc[0:max_iter]

    # The n_actions argument below defines how many actions are available at the start.
    th = TreeHeuristic(history.MemoryHistoryStorage(),
                       model.MemoryModelStorage(),
                       action.MemoryActionStorage(), n_actions=50)
    actions = make_arm(arm_ids=arm_ids)
    reward_sum = 0
    y = []
    print("Starting Now...")
    start = time.time()
    for i in range(max_iter):
        context = np.array(user_feature[user_feature.index ==
                                        streaming_batch.iloc[i, 0]])[0]
        if i == 0:
            th.build(first_context=context, actions=actions)
        history_id, recommendations = th.sample_from_beta(context=context)
        watched_list = reward_list[reward_list['user_id'] ==
                                   streaming_batch.iloc[i, 0]]
        if recommendations.action.id not in list(watched_list['movie_id']):
            # A wrong recommendation earns a reward of 0.
            th.reward(history_id, {recommendations.action.id: 0.0})
            th.update_D(context=context, action_id=recommendations.action.id,
                        reward=0.0)
        else:
            # A correct recommendation earns a reward of 1.
            th.reward(history_id, {recommendations.action.id: 1.0})
            th.update_D(context=context, action_id=recommendations.action.id,
                        reward=1.0)
            reward_sum += 1
        if i % batch_size == 0 and i != 0:
            for action_chosen in th._action_storage.iterids():
                th.update_tree(action_id=action_chosen)
        if i % 100 == 0:
            print("Step: {} -- Average Reward: {}".format(
                i, np.round(reward_sum / (i + 1), 4)))
        y.append(reward_sum / (i + 1))
    print("Time: {}".format(time.time() - start))

    x = list(range(max_iter))
    plt.figure()
    plt.plot(x, y, c='r')
    plt.title("Cumulative Average Reward of \n Tree Heuristic: Movie Lens Data")
    plt.show()
def train_covtype(n_samples=581000, batch_size=3000):
    file = pd.read_csv("covtype.data", header=None)
    data = file.values
    np.random.shuffle(data)
    X, temp = data[:, 0:54], data[:, 54]
    Y = pd.get_dummies(temp).values

    actions = make_arm(list(range(7)))
    th = TreeHeuristic(history.MemoryHistoryStorage(),
                       model.MemoryModelStorage(),
                       action.MemoryActionStorage(), n_actions=7)
    th.build(first_context=X[0], actions=actions)
    reward_sum = 0
    y = []
    print("Starting Now...")
    start = time.time()
    for i in range(n_samples):
        context = X[i]
        history_id, recommendations = th.sample_from_beta(context=context)
        # Receive the actual reward and accumulate it.
        actual_reward = Y[i, recommendations.action.id]
        reward_sum += actual_reward
        th.reward(history_id, {recommendations.action.id: actual_reward})
        # D must be updated on every trial.
        th.update_D(context=context, action_id=recommendations.action.id,
                    reward=actual_reward)
        # Refit the per-action trees once a full batch has accumulated.
        if i % batch_size == 0 and i != 0:
            for action_chosen in th._action_storage.iterids():
                th.update_tree(action_id=action_chosen)
        # Log progress every 100 steps.
        if i % 100 == 0:
            print("Step: {} -- Average Reward: {}".format(
                i, np.round(reward_sum / (i + 1), 4)))
        y.append(reward_sum / (i + 1))
    print("Time: {}".format(time.time() - start))

    x = list(range(n_samples))
    y[0] = 0
    plt.figure()
    plt.plot(x, y, c='r')
    plt.title("Cumulative Average Reward Flow of \n Tree Heuristic: Cover type Data")
    plt.show()
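# make_arm is referenced by both training functions above but not defined in
# this file. A minimal sketch, assuming striatum's Action class is what the
# TreeHeuristic action storage expects; not the original implementation.
from striatum.storage import Action

def make_arm(arm_ids):
    # Wrap each raw id (movie id / cover type class) in an Action object.
    return [Action(action_id) for action_id in arm_ids]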
def setUp(self):
    self.modelstorage = model.MemoryModelStorage()
    self.historystorage = history.MemoryHistoryStorage()
    self.actions = [1, 2, 3, 4, 5]
    # Simulated history: the optimal action is determined by which bucket
    # the summed context falls into.
    self.history_context = np.random.uniform(0, 5, (1000, 2))
    self.history_action = np.zeros(1000)
    for t in range(1000):
        for i in range(5):
            if i * 2 < sum(self.history_context[t, :]) <= (i + 1) * 2:
                self.history_action[t] = self.actions[i]
    # Two pre-trained experts for EXP4.P to aggregate.
    self.LogReg = OneVsRestClassifier(LogisticRegression())
    self.MNB = OneVsRestClassifier(MultinomialNB())
    self.LogReg.fit(self.history_context, self.history_action)
    self.MNB.fit(self.history_context, self.history_action)
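# Illustration only (not in the original source): how fitted experts like the
# ones in the setUp above would be wired into EXP4.P, mirroring the positional
# call in the EXP4P policy_evaluation function earlier in this file. The data
# and action ids here are synthetic assumptions.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from striatum.storage import history, model
from striatum.bandit import exp4p

ctx = np.random.uniform(0, 5, (1000, 2))
acts = np.random.choice([1, 2, 3, 4, 5], size=1000)
experts = [OneVsRestClassifier(LogisticRegression()).fit(ctx, acts)]
policy = exp4p.Exp4P([1, 2, 3, 4, 5], history.MemoryHistoryStorage(),
                     model.MemoryModelStorage(), experts, 0.5, pmin=None)
history_id, action = policy.get_action(ctx[0])
policy.reward(history_id, 1)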
def policy_generation(bandit, actions): """ Parameters ---------- bandit: 赌博机算法 actions:动作即推荐的电影 Returns ------- policy: 生成的策略 """ historystorage = history.MemoryHistoryStorage() # 内存中历史存储记录 modelstorage = model.MemoryModelStorage() # 内存中模型存储,为了统一 if bandit == 'Exp4P': policy = exp4p.Exp4P(historystorage, modelstorage, actions, delta=0.5, p_min=None) elif bandit == 'LinUCB': #policy = linucb.LinUCB(historystorage, modelstorage, actions, 0.3, 20) policy = linucb.LinUCB(history_storage=historystorage, model_storage=modelstorage, action_storage=actions, alpha=0.3, context_dimension=18) elif bandit == 'LinThompSamp': policy = linthompsamp.LinThompSamp( historystorage, modelstorage, actions, #d=20, Supposed to be context dimension context_dimension=18, delta=0.61, R=0.01, epsilon=0.71) elif bandit == 'UCB1': policy = ucb1.UCB1(historystorage, modelstorage, actions) elif bandit == 'Exp3': policy = exp3.Exp3(historystorage, modelstorage, actions, gamma=0.2) elif bandit == 'random': policy = 0 return policy
def policy_evaluation(self, policy, desired_action):
    if policy != 'UCB1':
        print("We don't support other bandit algorithms now!")
        return None
    historystorage = history.MemoryHistoryStorage()
    modelstorage = model.MemoryModelStorage()
    sum_error = 0
    policy = ucb1.UCB1(self.actions, historystorage, modelstorage)
    for t in range(self.t):
        history_id, action = policy.get_action(context=None)
        if desired_action[t][0] != action:
            policy.reward(history_id, 0)
            sum_error += 1
        else:
            policy.reward(history_id, 1)
    # Return the number of correct recommendations.
    return self.t - sum_error
def learn_policy(self, data_parser):
    np.random.seed(self.seed)
    while True:
        try:
            _, context, rewards = next(data_parser)
            # Keep the policy's action set in sync with the actions that are
            # actually available in the current event.
            curr_action_ids = set(context.keys())
            if self._action_storage.count():
                prev_action_ids = set(self._action_storage.iterids())
                new_action_ids = curr_action_ids.difference(prev_action_ids)
                old_action_ids = prev_action_ids.difference(curr_action_ids)
                if new_action_ids:
                    curr_actions = self._make_action(new_action_ids)
                    self.policy.add_action(curr_actions)
                if old_action_ids:
                    for old_action_id in old_action_ids:
                        self.policy.remove_action(old_action_id)
            else:
                curr_actions = self._make_action(curr_action_ids)
                self.policy.add_action(curr_actions)
            # Replay-style offline evaluation: only events where the policy's
            # choice matches the logged (highest-reward) action are counted.
            obs_action_id = max(rewards, key=rewards.get)
            history_id, recommendations = self.policy.get_action(context, 1)
            if obs_action_id == recommendations[0].action.id:
                if (self.data_size is None) or (np.random.uniform() <= self.data_size):
                    self.policy.reward(history_id, rewards)
                    self.reward_values = np.append(
                        self.reward_values, np.array(rewards[obs_action_id]))
                    self._n_iter += 1
                    # Periodically drop the history storage to bound memory use.
                    if (self.reset_freq is not None) and (self._n_iter % self.reset_freq == 0):
                        del self.policy._history_storage
                        _ = gc.collect()
                        self.policy._history_storage = history.MemoryHistoryStorage()
                    if (self.max_iter is not None) and (self._n_iter >= self.max_iter):
                        break
        except StopIteration:
            break
def __init__(self, policy, **params):
    # Evaluation settings are popped off so that only the bandit's own
    # hyperparameters are forwarded to the policy constructor.
    self.seed = params.pop('seed', None)
    self.data_size = params.pop('data_size', None)
    self.max_iter = params.pop('max_iter', None)
    self.reset_freq = params.pop('reset_freq', None)
    self._params = params
    self._history_storage = history.MemoryHistoryStorage()
    self._model_storage = model.MemoryModelStorage()
    self._action_storage = action.MemoryActionStorage()
    self.policy = policy(self._history_storage, self._model_storage,
                         self._action_storage, **self._params)
    self.reward_values = np.array([])
    self._n_iter = 0
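# Illustration only (not in the original source): the class that owns
# __init__ and learn_policy is not named in this file, so "ReplayEvaluator"
# below is a hypothetical stand-in. The data_parser contract is inferred from
# learn_policy: an iterator yielding
# (timestamp, {action_id: context}, {action_id: reward}).
import numpy as np
from striatum.bandit import linucb

def toy_parser(n_events=1000, n_arms=5, dim=18):
    for t in range(n_events):
        context = {a: np.random.uniform(size=dim) for a in range(n_arms)}
        rewards = {a: float(np.random.rand() < 0.2) for a in range(n_arms)}
        yield t, context, rewards

evaluator = ReplayEvaluator(linucb.LinUCB, alpha=0.3, context_dimension=18,
                            seed=0, max_iter=200)
evaluator.learn_policy(toy_parser())
print("mean replay reward:", evaluator.reward_values.mean())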
def setUp(self):
    self.modelstorage = model.MemoryModelStorage()
    self.historystorage = history.MemoryHistoryStorage()
    self.actions = [1, 2, 3]
    self.alpha = 1.00
def setUp(self):
    self.modelstorage = model.MemoryModelStorage()
    self.historystorage = history.MemoryHistoryStorage()
    self.actions = [1, 2, 3, 4, 5]
    self.gamma = 0.5