/
MABAlgorithms2a.py
203 lines (154 loc) · 10.2 KB
/
MABAlgorithms2a.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
# -*- coding: utf-8 -*-
"""
@author: Wenbo Wang
[Wang2020] Wenbo Wang, Amir Leshem, Dusit Niyato and Zhu Han, "Decentralized Learning for Channel
Allocation in IoT Networks over Unlicensed Bandwidth as a Contextual Multi-player Multi-armed Bandit Game"
License:
This program is licensed under the GPLv2 license. If you in any way use this code for research
that results in publications, please cite our original article listed above.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details.
"""
# This file defines and implements the following multi-player, multi-armed bandit algorithms (see also MABAlgorithms2.py).
#
# 1. [Tibrewal2019] Tibrewal, H., Patchala, S., Hanawal, M. K., and Darak, S. J. (2019). "Multiplayer multiarmed bandits for optimal
# assignment in heterogeneous networks," arXiv preprint arXiv:1901.03868,
#
# which is used as an additional reference for hopping-based algorithms such as (see also MABAlgorithms2.py)
# 1. [Sumit2019] Sumit J. Darak and Manjesh K. Hanawal, "Multi-player multi-armed bandits for stable allocation in
# heterogeneous ad-hoc networks," IEEE JSAC, Oct. 2019.
__author__ = "Wenbo Wang"
import numpy as np
from numfi import numfi
from scipy.optimize import linear_sum_assignment
from MABAlgorithms import MABAlgorithm
from Players2a import ESEPlayer
from loggingutils import info_logger
if __name__ == '__main__':
    # This module only provides algorithm definitions; when it is run directly
    # we print a warning and terminate with a success status.
    print("Warning: this script 'MABAlgorithms2a.py' is NOT executable...") # DEBUG
    # Raise SystemExit directly instead of calling exit(): the `exit` helper is
    # injected by the `site` module for interactive sessions and is not
    # guaranteed to exist (e.g. under `python -S` or in frozen applications).
    raise SystemExit(0)
"""
Algorithm: Explore-Signal-Exploit (ESE)
"""
class ESE(MABAlgorithm):
    """
    ESE implements the algorithm "Explore-Signal-Exploit (ESE)" proposed in
    "Multiplayer multiarmed bandits for optimal assignment in heterogeneous networks," arXiv preprint arXiv:1901.03868,
    also by the group of Sumit J. Darak and Manjesh K. Hanawal [Tibrewal2019].

    The algorithm is featured by a protocol of exchanging the local arm-value estimation among players, through an
    additional operation of "observation". After acquiring the estimated arm-value matrix, each player employs the
    Hungarian algorithm to find the optimal arm to pull.

    Strictly speaking, ESE is not a purely decentralized algorithm since it needs sequentially sending information
    from one player to another. It may not achieve the same performance as the rest of the algorithms within the same
    time T, due to the extra signaling phase.

    Time line implemented by `learn_policy` (one call == one round):
      1. random hopping for `Tr` rounds (happens once),
      2. `nbArm` rounds in which every player stays on its current arm (happens once),
      3. then repeated epochs, each made of
         a. sequential-hopping exploration for `nbArm * Ts` rounds,
         b. signaling for `nbPlayer * nbArm * Tb` rounds,
         c. exploitation for `int(exp(current_epoch))` rounds.
    """

    def __init__(self, param):
        """
        Build the algorithm and one `ESEPlayer` per player.

        param: mapping with the mandatory keys "nbPlayer", "nbArm" and "context_set", and the optional key
               "delta_R" (allowable probability of a non-orthogonal allocation, default 0.1).
        """
        self.nbPlayer = param["nbPlayer"]
        self.nbArm = param["nbArm"]
        self.context_set = param["context_set"] # not used

        # allowable prob. of non-orthogonal allocation
        self.delta_R = param['delta_R'] if 'delta_R' in param else 0.1
        self.epsilon = 0.1

        self.time = 0

        # epoch parameters (the reference paper does not provide a clear way of determining Ts and Tb, see their
        # Alg. 2), so we use a simplified approach.
        # NOTE: np.ceil returns floats, hence Tr, Ts and Tb are integer-valued floats.
        # rounds for random hopping (Tr), given by the configuration
        self.Tr = np.ceil(np.log(self.delta_R / self.nbArm) / (np.log(1-1/(4*self.nbArm))))
        self.Ts = np.ceil(8.*self.nbPlayer**2/(self.epsilon**2))
        self.Tb = np.ceil(np.log2(4.*self.nbPlayer/self.epsilon))

        self.current_epoch = 1
        # number of rounds elapsed before the current epoch started
        self.round_to_last_epoch = self.Tr + self.nbArm

        self.agents = []
        for playerID in range(self.nbPlayer):
            player_param = {"context_set": self.context_set,
                            "nbArm": self.nbArm,
                            "nbPlayer": self.nbPlayer,
                            "playerID": playerID
                            }
            self.agents.append(ESEPlayer(player_param))

        info_logger().log_info('ESE random hopping phase length {}'.format(self.Tr + self.nbArm)) #debug
        info_logger().log_info('ESE sequential hopping phase length Ts {}'.format(self.Ts)) #debug
        info_logger().log_info('ESE signaling length Tb {}'.format(self.Tb)) #debug

        epoch_length = self.nbArm * self.Ts + self.nbPlayer * self.nbArm * self.Tb + int(np.exp(self.current_epoch))
        info_logger().log_info('ESE play epoch length {}'.format(epoch_length)) #debug

    # --- Printing
    def __str__(self):
        # NOTE(review): the trailing "n" looks like a typo (possibly a lost "\n"); it is kept unchanged because
        # callers may use this string as an identifier -- confirm before changing it.
        return "Explore Signal Exploitn"

    # --- functionalities
    def reset(self, horizon=None):
        """
        Restart the game: rewind the clock and the epoch counters and reset every player.

        horizon: accepted for interface compatibility; it is not used here.
        """
        self.time = 0

        self.current_epoch = 1
        self.round_to_last_epoch = self.Tr + self.nbArm

        for agent in self.agents:
            agent.reset()

    def learn_policy(self, game_env, context=None, time=None):
        """
        Play one round of the game.

        game_env: 2-D array-like whose shape must equal (nbPlayer, nbArm); row `i` is handed to player `i` as its
                  arm-value observation and the whole matrix is used to compute the payoff.
        context:  not used by ESE itself; it is only forwarded to the players' `exploit` method.
        time:     forwarded as-is to the players' `random_hop` / `sequential_hop` methods; the phase switching
                  relies on the internal counter `self.time` instead.

        Returns the tuple (pulls, total_rewards, current_rewards): `pulls` is the (nbPlayer, nbArm) 0/1 action
        matrix of this round, `current_rewards` is what `observe_distributed_payoff` returns and `total_rewards`
        is its sum.

        Raises AssertionError if the shape of `game_env` does not match the stored parameters or if there are more
        players than arms.

        `resolve_collision` and `observe_distributed_payoff` are not defined in this class; they are presumably
        inherited from `MABAlgorithm`.
        """
        # context is not used in ESE
        (nbPlayer, nbArm) = np.shape(game_env)
        assert nbPlayer == self.nbPlayer and nbArm == self.nbArm, "input does not match the stored environment parameters."
        assert nbPlayer <= nbArm, "player number should be smaller than or equal to arm number."
        self.time = self.time + 1

        pulls = np.zeros((nbPlayer, nbArm))

        # lengths of the three phases of the current epoch and the position of this round inside the epoch
        explore_len = self.nbArm * self.Ts
        signal_len = self.nbPlayer * self.nbArm * self.Tb
        exploit_len = int(np.exp(self.current_epoch))
        elapsed = self.time - self.round_to_last_epoch

        # there is a single random hopping phase in the game, arm-value is not learned in this phase
        if self.time <= self.Tr:
            for agentID in range(nbPlayer):
                armID = self.agents[agentID].random_hop(None, time)
                pulls[agentID][armID] = 1

            collisions = self.resolve_collision(pulls)

        elif self.time <= self.Tr + self.nbArm:
            # this phase is to simulate the process for players to estimate the number of players
            for agentID in range(nbPlayer):
                pulls[agentID][self.agents[agentID].current_arm] = 1

            collisions = self.resolve_collision(pulls)

        elif elapsed <= explore_len:
            # exploration with sequential hopping
            for agentID in range(nbPlayer):
                armID = self.agents[agentID].sequential_hop(None, time)
                pulls[agentID][armID] = 1

            collisions = self.resolve_collision(pulls)

            for agentID in range(nbPlayer):
                self.agents[agentID].learn_arm_value(None, game_env[agentID,:], collisions)

        elif elapsed - explore_len <= signal_len:
            # signaling phase
            # We don't need to simulate the complete signaling process, but we need to truncate the precision of
            # each player's estimated arm-values.
            # The original phase of signaling, according to [Tibrewal2019], does not contribute to the accumulated
            # sum (experience) of arm evaluation, since for most of the time one player needs to either "observe" a
            # certain arm without playing one, or stop to play to "broadcast" bit "0" to the other players.
            # However, this incurs a HUGE amount of regret since the signaling phase is too long to neglect.
            # This may be a flaw of the original design of the ESE algorithm.
            # In our implementation we assume that the players are still able to get a reward during the signaling
            # phase.
            if elapsed - explore_len == 1:
                # first signaling round of the epoch: exchange (truncated) estimations and fix the policies
                arm_matrix = np.zeros((self.nbPlayer, self.nbArm))
                for agentID in range(nbPlayer):
                    arm_matrix[agentID, :] = self.agents[agentID].arm_score

                truncated_arm_matrix = numfi(arm_matrix, bits_frac=int(np.log2(4*self.nbPlayer/self.epsilon)))

                for agentID in range(nbPlayer):
                    agent = self.agents[agentID]

                    # Every player gets its OWN copy of the truncated matrix. Sharing a single object would let
                    # the row overwritten below by one player leak into the matrices of all the other players.
                    agent.estimated_arm_matrix = truncated_arm_matrix.copy()
                    # a player knows its own estimation without going through the signaling channel
                    agent.estimated_arm_matrix[agentID, :] = agent.arm_score

                    # each player performs local Hungarian algorithm to derive its "optimal" policy
                    # the method requires the number of rows (jobs) to be larger than that of columns (workers)
                    cost_matrix = np.negative(agent.estimated_arm_matrix.transpose())
                    # note that the cost_matrix is a transpose of the original matrix, so the first returned
                    # index array refers to arms and the second one to players
                    col_ind, row_ind = linear_sum_assignment(cost_matrix)

                    # set player's policy (the matching `pulls` entry is written by the loop below)
                    for armID, playerID in zip(col_ind, row_ind):
                        if playerID == agentID:
                            agent.policy = armID

            for agentID in range(self.nbPlayer):
                armID = self.agents[agentID].policy
                pulls[agentID][armID] = 1

            collisions = self.resolve_collision(pulls)

        elif elapsed - explore_len - signal_len <= exploit_len:
            # exploitation phase
            for agentID in range(nbPlayer):
                armID = self.agents[agentID].exploit(context, self.time)
                pulls[agentID][armID] = 1

            collisions = self.resolve_collision(pulls)

            if self.time == self.round_to_last_epoch + explore_len + signal_len + exploit_len:
                # last round of the epoch: update round number and move to the next epoch
                self.round_to_last_epoch += explore_len + signal_len + exploit_len
                self.current_epoch = self.current_epoch + 1
                info_logger().log_info('ESE play epoch {}'.format(self.current_epoch)) #debug

        current_rewards = self.observe_distributed_payoff(game_env, collisions)
        total_rewards = np.sum(current_rewards)
        return pulls, total_rewards, current_rewards
# add other algorithms here
# Names exported by a star-import of this module; extend this list when a new algorithm class is added.
__all__ = ["ESE"]