def test_invalid_no_context_history(self):
    decisions = [1, 1, 1]
    rewards = [0, 0, 0]
    mab = MAB([1, 2, 3], LearningPolicy.EpsilonGreedy(epsilon=0), NeighborhoodPolicy.Radius(2))
    with self.assertRaises(TypeError):
        mab.fit(decisions, rewards)
def test_invalid_no_context_policy(self):
    decisions = [1, 1, 1]
    rewards = [0, 0, 0]
    context_history = [[1, 1, 1], [1, 1, 1], [1, 1, 1]]
    mab = MAB([1, 2, 3], LearningPolicy.EpsilonGreedy(epsilon=0))
    with self.assertRaises(TypeError):
        mab.fit(decisions, rewards, context_history)
def test_rewards_inf_df(self):
    history = pd.DataFrame({'decision': [1, 1, 1, 2, 2, 2, 3, 3, 3],
                            'reward': [0, 0, 0, 0, 0, 0, 1, 1, np.inf]})
    mab = MAB([1, 2, 3], LearningPolicy.EpsilonGreedy(epsilon=0))
    with self.assertRaises(TypeError):
        mab.fit(history['decision'], history['reward'])
def predict(arms: List[Arm],
            decisions: Union[List, np.ndarray, pd.Series],
            rewards: Union[List, np.ndarray, pd.Series],
            learning_policy: Union[LearningPolicy.EpsilonGreedy, LearningPolicy.Random, LearningPolicy.Softmax,
                                   LearningPolicy.ThompsonSampling, LearningPolicy.UCB1, LearningPolicy.LinTS,
                                   LearningPolicy.LinUCB],
            neighborhood_policy: Union[None, NeighborhoodPolicy.Clusters, NeighborhoodPolicy.Radius,
                                       NeighborhoodPolicy.KNearest] = None,
            context_history: Union[None, List[Num], List[List[Num]], np.ndarray, pd.DataFrame, pd.Series] = None,
            contexts: Union[None, List[Num], List[List[Num]], np.ndarray, pd.DataFrame, pd.Series] = None,
            seed: Optional[int] = 123456,
            num_run: Optional[int] = 1,
            is_predict: Optional[bool] = True,
            n_jobs: Optional[int] = 1,
            backend: Optional[str] = None) -> (Union[Arm, List[Arm], List[float], List[List[float]]], MAB):
    """Sets up a MAB model and runs the given configuration.

    Returns the prediction (or list of predictions) and the mab instance when is_predict is True.
    Returns the expectation (or list of expectations) and the mab instance when is_predict is False.

    Calls the predict or predict_expectations method num_run times.
    """

    # Model
    mab = MAB(arms, learning_policy, neighborhood_policy, seed, n_jobs, backend)

    # Train
    mab.fit(decisions, rewards, context_history)

    # Test
    if is_predict:

        # Return: prediction(s) and the mab instance
        predictions = [mab.predict(contexts) for _ in range(num_run)]
        return predictions[0] if num_run == 1 else predictions, mab

    else:

        # Return: expectation(s) and the mab instance
        expectations = [mab.predict_expectations(contexts) for _ in range(num_run)]
        return expectations[0] if num_run == 1 else expectations, mab
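# For reference, a minimal hypothetical call to the utility above; the arms, decisions,
# rewards, and epsilon value are illustrative only and not taken from the test suite.
prediction, mab = predict(arms=[1, 2, 3],
                          decisions=[1, 1, 2, 3, 3],
                          rewards=[0, 1, 1, 0, 1],
                          learning_policy=LearningPolicy.EpsilonGreedy(epsilon=0.25),
                          seed=7,
                          num_run=1,
                          is_predict=True)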
def test_popularity(self):
    list_of_arms = ['Arm1', 'Arm2']
    decisions = ['Arm1', 'Arm1', 'Arm2', 'Arm1']
    rewards = [20, 17, 25, 9]
    mab = MAB(list_of_arms, LearningPolicy.Popularity())
    mab.fit(decisions, rewards)
    mab.predict()
    self.assertEqual("Arm2", mab.predict())
    self.assertDictEqual({'Arm1': 0.38016528925619836, 'Arm2': 0.6198347107438016},
                         mab.predict_expectations())
def test_partial_fit_indices(self):
    seed = 11
    n_dimensions = 5
    n_tables = 5
    rng = np.random.RandomState(seed)
    contexts = np.array([[rng.rand() for _ in range(7)] for _ in range(10)])
    decisions = np.array([rng.randint(0, 2) for _ in range(10)])
    rewards = np.array([rng.rand() for _ in range(10)])

    lsh = MAB(arms=[0, 1],
              learning_policy=LearningPolicy.Softmax(),
              neighborhood_policy=NeighborhoodPolicy.LSHNearest(n_dimensions, n_tables),
              seed=seed)
    lsh.fit(decisions, rewards, contexts)

    contexts2 = np.array([[rng.rand() for _ in range(7)] for _ in range(10)])
    decisions2 = np.array([rng.randint(0, 2) for _ in range(10)])
    rewards2 = np.array([rng.rand() for _ in range(10)])
    lsh.partial_fit(decisions2, rewards2, contexts2)

    self.assertListEqual(lsh._imp.table_to_hash_to_index[0][4], [1, 15, 16])
    self.assertListEqual(lsh._imp.table_to_hash_to_index[0][12], [9, 10, 11, 19])
def test_tables(self):
    seed = 11
    n_dimensions = 5
    n_tables = 5
    rng = np.random.RandomState(seed)
    contexts = np.array([[rng.rand() for _ in range(7)] for _ in range(10)])
    decisions = np.array([rng.randint(0, 2) for _ in range(10)])
    rewards = np.array([rng.rand() for _ in range(10)])

    lsh = MAB(arms=[0, 1],
              learning_policy=LearningPolicy.Softmax(),
              neighborhood_policy=NeighborhoodPolicy.LSHNearest(n_dimensions, n_tables),
              seed=seed)

    for i in range(n_tables):
        self.assertListEqual([], lsh._imp.table_to_plane[i])

    lsh.fit(decisions, rewards, contexts)

    self.assertListAlmostEqual(list(lsh._imp.table_to_plane[0][0]),
                               [1.74945474, -0.286073, -0.48456513, -2.65331856, -0.00828463])
    self.assertListEqual(list(lsh._imp.table_to_hash_to_index[0].keys()),
                         [1, 4, 5, 12, 13, 14, 15])
    self.assertListEqual(lsh._imp.table_to_hash_to_index[0][1], [3])
    self.assertListEqual(lsh._imp.table_to_hash_to_index[0][14], [0, 4, 8])
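# For intuition only: a standalone sketch of sign-based random-hyperplane hashing, the general
# technique behind LSHNearest. This is an assumption about the technique for illustration, not
# MABWiser's internal implementation; the plane shapes and bit-packing convention are made up.
import numpy as np

def hash_context(context, planes):
    # Project the context onto each hyperplane and pack the resulting sign bits into an integer key
    signs = (planes @ context) > 0
    return int(signs.dot(1 << np.arange(len(planes))))

# Hypothetical usage: 5 hyperplanes for 7-dimensional contexts, mirroring n_dimensions=5 above
#   planes = np.random.RandomState(11).standard_normal((5, 7))
#   key = hash_context(np.random.RandomState(11).rand(7), planes)  # integer bucket in [0, 2**5)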
def test_rewards_inf_array(self):
    decisions = np.asarray([1, 1, 1, 2, 2, 2, 3, 3, 3])
    rewards = np.asarray([0, 0, 0, 0, 0, 0, 1, 1, np.inf])
    mab = MAB([1, 2, 3], LearningPolicy.EpsilonGreedy(epsilon=0))
    with self.assertRaises(TypeError):
        mab.fit(decisions, rewards)
def test_rewards_null_list(self):
    decisions = [1, 1, 1, 2, 2, 2, 3, 3, 3]
    rewards = [0, 0, 0, 0, 0, 0, 1, 1, None]
    mab = MAB([1, 2, 3], LearningPolicy.EpsilonGreedy(epsilon=0))
    with self.assertRaises(TypeError):
        mab.fit(decisions, rewards)
def test_invalid_decisions_rewards_length(self):
    decisions = [1, 1, 2, 2, 2, 3, 3]
    rewards = [0, 0, 0, 0, 0, 0, 1, 1, 1]
    mab = MAB([1, 2, 3], LearningPolicy.EpsilonGreedy(epsilon=0))
    with self.assertRaises(ValueError):
        mab.fit(decisions, rewards)
train = scaler.fit_transform(train_df[['age', 'click_rate', 'subscriber']].values.astype('float64'))
test = scaler.transform(test_df.values.astype('float64'))

########################################################
# Radius Neighborhood Policy with UCB1 Learning Policy
########################################################

# Radius contextual policy with radius equal to 5 and UCB1 learning with alpha 1.25
radius = MAB(arms=ads,
             learning_policy=LearningPolicy.UCB1(alpha=1.25),
             neighborhood_policy=NeighborhoodPolicy.Radius(radius=5))

# Learn from previous ads shown and revenues generated
radius.fit(decisions=train_df['ad'], rewards=train_df['revenues'], contexts=train)

# Predict the next best ad to show
prediction = radius.predict(test)

# Expectation of each ad based on learning from past ad revenues
expectations = radius.predict_expectations(test)

# Results
print("Radius: ", prediction, " ", expectations)
assert (prediction == [4, 4])

# Online update of the model
radius.partial_fit(decisions=prediction, rewards=test_df_revenue, contexts=test)
# Historical data of layout decisions and corresponding rewards
layouts = [1, 1, 1, 2, 1, 2, 2, 1, 2, 1, 2, 2, 1, 2, 1]
revenues = [10, 17, 22, 9, 4, 0, 7, 8, 20, 9, 50, 5, 7, 12, 10]

###################################
# Epsilon Greedy Learning Policy
###################################

# Epsilon Greedy learning policy with random exploration set to 15%
greedy = MAB(arms=options,
             learning_policy=LearningPolicy.EpsilonGreedy(epsilon=0.15),
             seed=123456)

# Learn from previous layout decisions and the revenues they generated
greedy.fit(decisions=layouts, rewards=revenues)

# Predict the next best layout decision
prediction = greedy.predict()

# Expected revenue of each layout learnt from historical data based on the epsilon greedy policy
expectations = greedy.predict_expectations()

# Results
print("Epsilon Greedy: ", prediction, " ", expectations)
assert (prediction == 1)

# Additional historical data becomes available which allows online learning
additional_layouts = [1, 2, 1, 2]
additional_revenues = [0, 12, 7, 19]
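# The example presumably continues with an online update using the new data; a sketch of that
# step, assuming the same MAB.partial_fit API used by the other examples in this file:
greedy.partial_fit(decisions=additional_layouts, rewards=additional_revenues)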
arm_to_scaler = {}
for arm in arms:
    # Get indices for arm
    indices = np.where(decisions_train == arm)

    # Fit standard scaler
    scaler = StandardScaler()
    scaler.fit(contexts[indices])
    arm_to_scaler[arm] = scaler

########################################################
# LinUCB Learning Policy
########################################################

# LinUCB learning policy with alpha 1.25 and n_jobs = -1 (maximum available cores)
linucb = MAB(arms=arms,
             learning_policy=LearningPolicy.LinUCB(alpha=1.25, arm_to_scaler=arm_to_scaler),
             n_jobs=-1)

# Learn from playlists shown and observed click rewards for each arm
linucb.fit(decisions=decisions_train, rewards=rewards_train, contexts=contexts_train)

# Predict the next best playlist to recommend
prediction = linucb.predict(contexts_test)

# Results
print("LinUCB: ", prediction[:10])
scaler = StandardScaler()
train = scaler.fit_transform(train_df[['age', 'click_rate', 'subscriber']].values.astype('float64'))
test = scaler.transform(test_df.values.astype('float64'))

##################################################
# Linear Upper Confidence Bound Learning Policy
##################################################

# LinUCB learning policy with alpha 1.25 and l2_lambda 1
linucb = MAB(arms=ads,
             learning_policy=LearningPolicy.LinUCB(alpha=1.25, l2_lambda=1))

# Learn from previous ads shown and revenues generated
linucb.fit(decisions=train_df['ad'], rewards=train_df['revenues'], contexts=train)

# Predict the next best ad to show
prediction = linucb.predict(test)

# Expectation of each ad based on learning from past ad revenues
expectations = linucb.predict_expectations(test)

# Results
print("LinUCB: ", prediction, " ", expectations)
assert (prediction == [5, 2])

# Online update of the model
linucb.partial_fit(decisions=prediction, rewards=test_df_revenue, contexts=test)