import pandas as pd

from striatum.storage import Action, MemoryActionStorage


def get_data():
    """Load the MovieLens-style CSV files and wrap the movie ids in an action storage."""
    streaming_batch = pd.read_csv('streaming_batch.csv', sep='\t',
                                  names=['user_id'], engine='c')
    user_feature = pd.read_csv('user_feature.csv', sep='\t', header=0,
                               index_col=0, engine='c')
    actions_id = list(pd.read_csv('actions.csv', sep='\t', header=0,
                                  engine='c')['movie_id'])
    reward_list = pd.read_csv('reward_list.csv', sep='\t', header=0, engine='c')
    action_context = pd.read_csv('action_context.csv', sep='\t', header=0,
                                 engine='c')

    # Each movie id becomes one bandit arm in the in-memory action storage.
    actions = MemoryActionStorage()
    actions.add([Action(movie_id) for movie_id in actions_id])
    return streaming_batch, user_feature, actions, reward_list, action_context
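# --- Usage sketch (not part of the original example) --------------------------
# Illustrates the calling pattern for the data returned by get_data().  The
# UCB1 policy is only a stand-in, and the sketch assumes ``user_feature`` is
# indexed by user id with one feature row per user.
from striatum.bandit import UCB1
from striatum.storage import MemoryHistoryStorage, MemoryModelStorage


def run_first_round():
    streaming_batch, user_feature, actions, _reward_list, _action_context = get_data()
    policy = UCB1(MemoryHistoryStorage(), MemoryModelStorage(), actions)
    user_id = streaming_batch['user_id'].iloc[0]
    # one context entry per candidate movie, as the striatum policies expect
    context = {action_id: user_feature.loc[user_id]
               for action_id in actions.iterids()}
    history_id, recommendations = policy.get_action(context, n_actions=1)
    return history_id, recommendations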
import matplotlib.pyplot as plt
import numpy as np

from striatum.storage import (MemoryHistoryStorage, MemoryModelStorage,
                              MemoryActionStorage, Action)
from striatum.bandit import LinUCB
from striatum import simulation


def main():
    n_rounds = 1000
    context_dimension = 5
    action_storage = MemoryActionStorage()
    action_storage.add([Action(i) for i in range(5)])

    # Parameter tuning: sweep alpha (the weight on the confidence bound) and
    # keep the value with the highest simulated CTR.
    tuning_region = np.arange(0, 3, 0.05)
    ctr_tuning = np.empty(shape=len(tuning_region))
    context1, desired_actions1 = simulation.simulate_data(
        n_rounds, context_dimension, action_storage, random_state=0)
    for alpha_i, alpha in enumerate(tuning_region):
        policy = LinUCB(history_storage=MemoryHistoryStorage(),
                        model_storage=MemoryModelStorage(),
                        action_storage=action_storage,
                        context_dimension=context_dimension,
                        alpha=alpha)
        cum_regret = simulation.evaluate_policy(policy, context1,
                                                desired_actions1)
        ctr_tuning[alpha_i] = n_rounds - cum_regret[-1]
    ctr_tuning /= n_rounds
    alpha_opt = tuning_region[np.argmax(ctr_tuning)]
    simulation.plot_tuning_curve(tuning_region, ctr_tuning,
                                 label="alpha changes")

    # Regret analysis: rerun with the tuned alpha on fresh data, rewarding the
    # policy 1 when it recommends the desired action and 0 otherwise.
    n_rounds = 10000
    context2, desired_actions2 = simulation.simulate_data(
        n_rounds, context_dimension, action_storage, random_state=1)
    policy = LinUCB(history_storage=MemoryHistoryStorage(),
                    model_storage=MemoryModelStorage(),
                    action_storage=action_storage,
                    context_dimension=context_dimension,
                    alpha=alpha_opt)
    for t in range(n_rounds):
        history_id, recommendation = policy.get_action(context2[t])
        action_id = recommendation.action.id
        if desired_actions2[t] != action_id:
            policy.reward(history_id, {action_id: 0})
        else:
            policy.reward(history_id, {action_id: 1})

    policy.plot_avg_regret()
    plt.show()
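# Standard entry point so the LinUCB example can be run directly as a script
# (assumed to mirror the original example's layout).
if __name__ == '__main__':
    main()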
import matplotlib.pyplot as plt
import numpy as np

from striatum.storage import (MemoryHistoryStorage, MemoryModelStorage,
                              MemoryActionStorage, Action)
from striatum.bandit import Exp3
from striatum import simulation


def main():
    n_rounds = 1000
    context_dimension = 5
    action_storage = MemoryActionStorage()
    action_storage.add([Action(i) for i in range(5)])
    random_state = np.random.RandomState(0)

    # Parameter tuning: sweep the exploration rate gamma and keep the value
    # with the highest simulated CTR.
    tuning_region = np.arange(0.001, 1, 0.03)
    ctr_tuning = np.zeros(shape=len(tuning_region))
    context1, desired_actions1 = simulation.simulate_data(
        n_rounds, context_dimension, action_storage, random_state=0)
    for gamma_i, gamma in enumerate(tuning_region):
        policy = Exp3(MemoryHistoryStorage(), MemoryModelStorage(),
                      action_storage, gamma=gamma, random_state=random_state)
        cum_regret = simulation.evaluate_policy(policy, context1,
                                                desired_actions1)
        ctr_tuning[gamma_i] = n_rounds - cum_regret[-1]
    ctr_tuning /= n_rounds
    gamma_opt = tuning_region[np.argmax(ctr_tuning)]
    simulation.plot_tuning_curve(tuning_region, ctr_tuning,
                                 label="gamma changes")

    # Regret analysis: rerun with the tuned gamma on fresh data, rewarding the
    # policy 1 when it recommends the desired action and 0 otherwise.
    n_rounds = 10000
    context2, desired_actions2 = simulation.simulate_data(
        n_rounds, context_dimension, action_storage, random_state=1)
    policy = Exp3(MemoryHistoryStorage(), MemoryModelStorage(), action_storage,
                  gamma=gamma_opt, random_state=random_state)
    for t in range(n_rounds):
        history_id, recommendation = policy.get_action(context2[t])
        action_id = recommendation.action.id
        if desired_actions2[t] != action_id:
            policy.reward(history_id, {action_id: 0})
        else:
            policy.reward(history_id, {action_id: 1})

    policy.plot_avg_regret()
    plt.show()
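# Optional textual summary of a tuning sweep (a hypothetical helper, not part
# of the striatum API); useful when running headless, where the plots are not
# shown.
def summarize_tuning(tuning_region, ctr_tuning, name="gamma"):
    best = int(np.argmax(ctr_tuning))
    print("best %s = %.3f (simulated CTR %.3f)"
          % (name, tuning_region[best], ctr_tuning[best]))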
import matplotlib.pyplot as plt

from striatum.storage import (MemoryHistoryStorage, MemoryModelStorage,
                              MemoryActionStorage, Action)
from striatum.bandit import UCB1
from striatum import simulation


def main():
    context_dimension = 5
    action_storage = MemoryActionStorage()
    action_storage.add([Action(i) for i in range(5)])

    # Regret analysis only: UCB1 has no hyperparameter to tune.  The policy is
    # rewarded 1 when it recommends the desired action and 0 otherwise.
    n_rounds = 10000
    context, desired_actions = simulation.simulate_data(
        n_rounds, context_dimension, action_storage, random_state=1)
    policy = UCB1(MemoryHistoryStorage(), MemoryModelStorage(), action_storage)
    for t in range(n_rounds):
        history_id, recommendation = policy.get_action(context[t])
        action_id = recommendation.action.id
        if desired_actions[t] != action_id:
            policy.reward(history_id, {action_id: 0})
        else:
            policy.reward(history_id, {action_id: 1})

    policy.plot_avg_regret()
    plt.show()
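# The manual reward loop in main() can also be driven by the simulation helper
# the other examples use for tuning (a sketch; it assumes
# simulation.evaluate_policy applies the same 0/1 reward rule as the loop).
def simulated_ctr(context, desired_actions, action_storage, n_rounds):
    policy = UCB1(MemoryHistoryStorage(), MemoryModelStorage(), action_storage)
    cum_regret = simulation.evaluate_policy(policy, context, desired_actions)
    return (n_rounds - cum_regret[-1]) / n_rounds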
class BaseBanditTest(object):
    # pylint: disable=protected-access

    def setUp(self):  # pylint: disable=invalid-name
        self.model_storage = MemoryModelStorage()
        self.history_storage = MemoryHistoryStorage()
        self.action_storage = MemoryActionStorage()
        self.actions = [Action(i + 1) for i in range(3)]
        self.action_storage.add(self.actions)

    def test_initialization(self):
        policy = self.policy
        self.assertEqual(self.model_storage, policy._model_storage)
        self.assertEqual(self.history_storage, policy._history_storage)
        self.assertEqual(self.history_storage, policy.history_storage)
        self.assertEqual(self.action_storage, policy._action_storage)

    def test_get_action_with_empty_storage(self):
        policy = self.policy_with_empty_action_storage
        context = {}
        history_id, recommendations = policy.get_action(context, 1)
        self.assertEqual(history_id, 0)
        self.assertEqual(len(recommendations), 0)
        self.assertDictEqual(
            policy._history_storage.get_unrewarded_history(history_id).context,
            context)

    def test_get_first_action(self):
        policy = self.policy
        context = {1: [1, 1], 2: [2, 2], 3: [3, 3]}
        history_id, recommendations = policy.get_action(context, 1)
        self.assertEqual(history_id, 0)
        self.assertEqual(len(recommendations), 1)
        self.assertIn(recommendations[0].action.id,
                      self.action_storage.iterids())
        self.assertEqual(
            policy._history_storage.get_unrewarded_history(history_id).context,
            context)

    def test_get_action_with_n_actions_none(self):
        policy = self.policy
        context = {1: [1, 1], 2: [2, 2], 3: [3, 3]}
        history_id, recommendations = policy.get_action(context, None)
        self.assertEqual(history_id, 0)
        self.assertIsInstance(recommendations, Recommendation)
        self.assertIn(recommendations.action.id,
                      self.action_storage.iterids())
        self.assertEqual(
            policy._history_storage.get_unrewarded_history(history_id).context,
            context)

    def test_get_all_action(self):
        policy = self.policy
        context = {1: [1, 1], 2: [2, 2], 3: [3, 3]}
        history_id, recommendations = policy.get_action(context, -1)
        self.assertEqual(history_id, 0)
        self.assertEqual(len(recommendations), len(self.actions))
        for rec in recommendations:
            self.assertIn(rec.action.id, self.action_storage.iterids())
        self.assertEqual(
            policy._history_storage.get_unrewarded_history(history_id).context,
            context)

    def test_get_multiple_action(self):
        policy = self.policy
        n_actions = 2
        context = {1: [1, 1], 2: [2, 2], 3: [3, 3]}
        history_id, recommendations = policy.get_action(context, n_actions)
        self.assertEqual(history_id, 0)
        self.assertEqual(len(recommendations), n_actions)
        for rec in recommendations:
            self.assertIn(rec.action.id, self.action_storage.iterids())
        self.assertEqual(
            policy._history_storage.get_unrewarded_history(history_id).context,
            context)

    def test_update_reward(self):
        policy = self.policy
        context = {1: [1, 1], 2: [2, 2], 3: [3, 3]}
        history_id, recommendations = policy.get_action(context, 1)
        rewards = {recommendations[0].action.id: 1.}
        policy.reward(history_id, rewards)
        self.assertEqual(
            policy._history_storage.get_history(history_id).rewards, rewards)

    def test_delay_reward(self):
        policy = self.policy
        context1 = {1: [1, 1], 2: [2, 2], 3: [3, 3]}
        context2 = {1: [0, 0], 2: [3, 3], 3: [6, 6]}
        history_id1, recommendations1 = policy.get_action(context1, 2)
        self.assertEqual(len(recommendations1), 2)
        history_id2, recommendations2 = policy.get_action(context2, 1)
        self.assertEqual(len(recommendations2), 1)
        rewards = {
            recommendations1[0].action.id: 0.,
            recommendations1[1].action.id: 1.,
        }
        policy.reward(history_id1, rewards)
        self.assertDictEqual(
            policy._history_storage.get_history(history_id1).context, context1)
        self.assertDictEqual(
            policy._history_storage.get_unrewarded_history(
                history_id2).context, context2)
        self.assertDictEqual(
            policy._history_storage.get_history(history_id1).rewards, rewards)
        self.assertDictEqual(
            policy._history_storage.get_unrewarded_history(
                history_id2).rewards, {})

    def test_reward_order_descending(self):
        policy = self.policy
        context1 = {1: [1, 1], 2: [2, 2], 3: [3, 3]}
        context2 = {1: [0, 0], 2: [3, 3], 3: [6, 6]}
        history_id1, _ = policy.get_action(context1, 2)
        history_id2, recommendations2 = policy.get_action(context2)
        rewards = {recommendations2.action.id: 1.}
        policy.reward(history_id2, rewards)
        self.assertDictEqual(
            policy._history_storage.get_unrewarded_history(
                history_id1).context, context1)
        self.assertDictEqual(
            policy._history_storage.get_history(history_id2).context, context2)
        self.assertDictEqual(
            policy._history_storage.get_unrewarded_history(
                history_id1).rewards, {})
        self.assertDictEqual(
            policy._history_storage.get_history(history_id2).rewards, rewards)

    def test_update_action(self):
        action = self.actions[1]
        action.action_type = "text"
        action.action_text = "hello"
        self.policy.update_action(action)
        updated_action = self.action_storage.get(action.id)
        self.assertEqual(updated_action.action_type, action.action_type)
        self.assertEqual(updated_action.action_text, action.action_text)
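# A minimal sketch of how this mix-in is consumed: a concrete test class pairs
# BaseBanditTest with unittest.TestCase and provides the ``policy`` and
# ``policy_with_empty_action_storage`` attributes the tests above rely on.
# The UCB1 wiring and the import paths below are illustrative assumptions, not
# taken from this file.
import unittest

from striatum.bandit import UCB1
from striatum.storage import (MemoryHistoryStorage, MemoryModelStorage,
                              MemoryActionStorage)


class Ucb1BanditTest(BaseBanditTest, unittest.TestCase):

    def setUp(self):
        super(Ucb1BanditTest, self).setUp()
        self.policy = UCB1(self.history_storage, self.model_storage,
                           self.action_storage)
        self.policy_with_empty_action_storage = UCB1(
            MemoryHistoryStorage(), MemoryModelStorage(), MemoryActionStorage())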
import matplotlib.pyplot as plt
import numpy as np

from striatum.storage import (MemoryHistoryStorage, MemoryModelStorage,
                              MemoryActionStorage, Action)
from striatum.bandit import LinThompSamp
from striatum import simulation


def main():
    n_rounds = 1000
    context_dimension = 5
    action_storage = MemoryActionStorage()
    action_storage.add([Action(i) for i in range(5)])
    random_state = np.random.RandomState(0)

    # Parameter tuning: sweep delta, R, and epsilon one at a time while holding
    # the other two fixed, and keep the value with the highest simulated CTR.
    tuning_region = np.arange(0.01, 0.99, 0.1)
    ctr_delta = np.zeros(shape=len(tuning_region))
    ctr_r = np.zeros(shape=len(tuning_region))
    ctr_epsilon = np.zeros(shape=len(tuning_region))
    context1, desired_actions1 = simulation.simulate_data(
        n_rounds, context_dimension, action_storage, random_state=0)
    for param_i, param in enumerate(tuning_region):
        policy = LinThompSamp(MemoryHistoryStorage(), MemoryModelStorage(),
                              action_storage,
                              context_dimension=context_dimension,
                              delta=param, R=0.01, epsilon=0.5,
                              random_state=random_state)
        cum_regret = simulation.evaluate_policy(policy, context1,
                                                desired_actions1)
        ctr_delta[param_i] = n_rounds - cum_regret[-1]

        policy = LinThompSamp(MemoryHistoryStorage(), MemoryModelStorage(),
                              action_storage,
                              context_dimension=context_dimension,
                              delta=0.5, R=param, epsilon=0.5,
                              random_state=random_state)
        cum_regret = simulation.evaluate_policy(policy, context1,
                                                desired_actions1)
        ctr_r[param_i] = n_rounds - cum_regret[-1]

        policy = LinThompSamp(MemoryHistoryStorage(), MemoryModelStorage(),
                              action_storage,
                              context_dimension=context_dimension,
                              delta=0.5, R=0.01, epsilon=param,
                              random_state=random_state)
        cum_regret = simulation.evaluate_policy(policy, context1,
                                                desired_actions1)
        ctr_epsilon[param_i] = n_rounds - cum_regret[-1]
    ctr_delta /= n_rounds
    ctr_r /= n_rounds
    ctr_epsilon /= n_rounds
    delta_opt = tuning_region[np.argmax(ctr_delta)]
    r_opt = tuning_region[np.argmax(ctr_r)]
    epsilon_opt = tuning_region[np.argmax(ctr_epsilon)]

    # Plot the parameter tuning results.
    plt.plot(tuning_region, ctr_delta, 'ro-',
             label="delta changes, R = 0.01, eps = 0.5")
    plt.plot(tuning_region, ctr_r, 'gs-',
             label="R changes, delta = 0.5, eps = 0.5")
    plt.plot(tuning_region, ctr_epsilon, 'b^-',
             label="eps changes, delta = 0.5, R = 0.01")
    plt.xlabel('parameter value')
    plt.ylabel('CTR')
    plt.legend(bbox_to_anchor=(1., 0.7))
    plt.ylim([0, 1])
    plt.title("Parameter Tuning Curve - LinThompSamp")
    plt.show()

    # Regret analysis: rerun with the tuned parameters on fresh data, rewarding
    # the policy 1 when it recommends the desired action and 0 otherwise.
    n_rounds = 10000
    context2, desired_actions2 = simulation.simulate_data(
        n_rounds, context_dimension, action_storage, random_state=1)
    policy = LinThompSamp(MemoryHistoryStorage(), MemoryModelStorage(),
                          action_storage,
                          context_dimension=context_dimension,
                          delta=delta_opt, R=r_opt, epsilon=epsilon_opt,
                          random_state=random_state)
    for t in range(n_rounds):
        history_id, recommendation = policy.get_action(context2[t])
        action_id = recommendation.action.id
        if desired_actions2[t] != action_id:
            policy.reward(history_id, {action_id: 0})
        else:
            policy.reward(history_id, {action_id: 1})

    policy.plot_avg_regret()
    plt.show()
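# Standard entry point so the LinThompSamp example can be run directly as a
# script (assumed to mirror the original example's layout).
if __name__ == '__main__':
    main()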