import numpy as np

# NOTE: Serializable, MABox, MADiscrete, and MAEnvSpec are assumed to come
# from the surrounding repo; their import paths are not shown in this excerpt.


# Gaussian squeeze domain constructor.
def __init__(self, agent_num, mus=[0., 400.], sigmas=[100., 200.],
             action_low=0, action_high=10):
    Serializable.quick_init(self, locals())
    self.game_name = 'gaussian_squeeze'
    self.mus = np.array(mus)
    self.sigmas = np.array(sigmas)
    self.agent_num = agent_num
    self.action_range = [action_low, action_high]
    lows = np.array([np.array([action_low]) for _ in range(self.agent_num)])
    highs = np.array([np.array([action_high]) for _ in range(self.agent_num)])
    self.action_spaces = MABox(lows=lows, highs=highs)
    # stateless game: every agent observes a single dummy state
    self.observation_spaces = MADiscrete([1] * self.agent_num)
    self.env_specs = MAEnvSpec(self.observation_spaces, self.action_spaces)
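
# The reward step is not shown in this excerpt. In the standard Gaussian
# squeeze domain (GSD), agents jointly allocate a resource and the reward is
# x * exp(-((x - mu) / sigma)^2) for each (mu, sigma) objective, where x is
# the total allocation. The helper below is a hedged sketch of that standard
# formulation, not necessarily the exact reward used by this class.
def _gaussian_squeeze_reward_sketch(actions, mus, sigmas):
    x = np.sum(actions)  # total joint allocation
    # one Gaussian "squeeze" objective per (mu, sigma) pair, summed
    return np.sum(x * np.exp(-((x - mus) / sigmas) ** 2))
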
# p-beauty contest constructor.
def __init__(self, agent_num, game_name='pbeauty', p=0.67, reward_type='abs',
             action_low=-1., action_high=1.):
    Serializable.quick_init(self, locals())
    self.agent_num = agent_num
    self.p = p
    self.game_name = game_name
    self.reward_type = reward_type
    self.action_range = [action_low, action_high]
    lows = np.array([np.array([action_low]) for _ in range(self.agent_num)])
    highs = np.array([np.array([action_high]) for _ in range(self.agent_num)])
    self.action_spaces = MABox(lows=lows, highs=highs)
    self.observation_spaces = MADiscrete([1] * self.agent_num)
    self.env_specs = MAEnvSpec(self.observation_spaces, self.action_spaces)
    self.t = 0
    self.rewards = np.zeros((self.agent_num,))
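
# In a p-beauty contest each agent picks a number and is rewarded by how
# close its pick lands to p times the group mean; reward_type='abs' suggests
# a negative absolute distance. A hedged sketch of that rule (assumed, not
# taken from this class, which may also rescale actions first):
def _pbeauty_reward_sketch(actions, p=0.67):
    actions = np.asarray(actions)
    target = p * np.mean(actions)
    return -np.abs(actions - target)  # one reward per agent
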
# Differential (continuous-action) game constructor.
def __init__(self, game_name, agent_num, action_low=-10, action_high=10):
    Serializable.quick_init(self, locals())
    self.game = game_name
    self.agent_num = agent_num
    self.action_range = [action_low, action_high]
    lows = np.array([np.array([action_low]) for _ in range(self.agent_num)])
    highs = np.array([np.array([action_high]) for _ in range(self.agent_num)])
    self.action_spaces = MABox(lows=lows, highs=highs)
    self.observation_spaces = MADiscrete([1] * self.agent_num)
    self.env_specs = MAEnvSpec(self.observation_spaces, self.action_spaces)
    self.t = 0
    self.numplots = 0
    self.payoff = {}

    if self.game == 'zero_sum':
        assert self.agent_num == 2
        self.payoff[0] = lambda a1, a2: a1 * a2
        self.payoff[1] = lambda a1, a2: -a1 * a2
    elif self.game == 'trigonometric':
        assert self.agent_num == 2
        self.payoff[0] = lambda a1, a2: np.cos(a2) * a1
        self.payoff[1] = lambda a1, a2: np.sin(a1) * a2
    elif self.game == 'matching_pennies':
        assert self.agent_num == 2
        # Matching pennies is zero-sum, so the second payoff is the negation
        # of the first. (The original code had both payoffs identical, which
        # looks like a bug; this is the standard formulation.)
        self.payoff[0] = lambda a1, a2: (a1 - 0.5) * (a2 - 0.5)
        self.payoff[1] = lambda a1, a2: -(a1 - 0.5) * (a2 - 0.5)
    elif self.game == 'rotational':
        assert self.agent_num == 2
        self.payoff[0] = lambda a1, a2: 0.5 * a1 * a1 + 10 * a1 * a2
        self.payoff[1] = lambda a1, a2: 0.5 * a2 * a2 - 10 * a1 * a2
    elif self.game == 'wolf':
        assert self.agent_num == 2

        def V(alpha, beta, payoff):
            # Expected payoff of mixed strategies (alpha, beta) in a 2x2
            # game with the given payoff table.
            u = (payoff[(0, 0)] - payoff[(0, 1)]
                 - payoff[(1, 0)] + payoff[(1, 1)])
            return (alpha * beta * u
                    + alpha * (payoff[(0, 1)] - payoff[(1, 1)])
                    + beta * (payoff[(1, 0)] - payoff[(1, 1)])
                    + payoff[(1, 1)])

        payoff_0 = np.array([[0, 3], [1, 2]])
        payoff_1 = np.array([[3, 2], [0, 1]])
        self.payoff[0] = lambda a1, a2: V(a1, a2, payoff_0)
        self.payoff[1] = lambda a1, a2: V(a1, a2, payoff_1)
    elif self.game == 'ma_softq':
        assert self.agent_num == 2
        # Two peaks: a wide, shallow one at (x1, y1) and a narrow one at
        # (x2, y2) raised by c, so the global optimum sits at (5, 5).
        h1, h2 = 0.8, 1.
        s1, s2 = 3., 1.
        x1, x2 = -5., 5.
        y1, y2 = -5., 5.
        c = 10.

        def max_f(a1, a2):
            f1 = h1 * (-(np.square(a1 - x1) / s1) - (np.square(a2 - y1) / s1))
            f2 = h2 * (-(np.square(a1 - x2) / s2)
                       - (np.square(a2 - y2) / s2)) + c
            return max(f1, f2)

        # fully cooperative: both agents share the same payoff
        self.payoff[0] = lambda a1, a2: max_f(a1, a2)
        self.payoff[1] = lambda a1, a2: max_f(a1, a2)

    self.rewards = np.zeros((self.agent_num,))
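
# A quick sanity check of the 'ma_softq' landscape defined above. This is a
# standalone copy of max_f using the same constants as the constructor:
def _ma_softq_payoff(a1, a2, h1=0.8, h2=1., s1=3., s2=1., c=10.):
    f1 = h1 * (-((a1 + 5.) ** 2 / s1) - ((a2 + 5.) ** 2 / s1))
    f2 = h2 * (-((a1 - 5.) ** 2 / s2) - ((a2 - 5.) ** 2 / s2)) + c
    return max(f1, f2)

# The local optimum at (-5, -5) is worth 0, while the global optimum at
# (5, 5) is worth c = 10 -- the classic trap for independent learners.
assert _ma_softq_payoff(-5., -5.) == 0.
assert _ma_softq_payoff(5., 5.) == 10.
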
# MPE-style multi-agent particle environment constructor (this appears to be
# the standard MultiAgentEnv from multiagent-particle-envs, extended with the
# repo's MABox/MAEnvSpec wrappers; gym.spaces and MultiDiscrete are assumed
# imported).
def __init__(self, world, reset_callback=None, reward_callback=None,
             observation_callback=None, info_callback=None,
             done_callback=None, shared_viewer=True):
    self.world = world
    self.agents = self.world.policy_agents
    # set required vectorized gym env property
    self.n = len(world.policy_agents)
    # scenario callbacks
    self.reset_callback = reset_callback
    self.reward_callback = reward_callback
    self.observation_callback = observation_callback
    self.info_callback = info_callback
    self.done_callback = done_callback
    # environment parameters
    # if true, an action is an integer 0..N-1; otherwise it is a one-hot
    # N-dimensional vector
    self.discrete_action_space = True
    # if true, even continuous actions are executed discretely
    self.discrete_action_input = False
    self.force_discrete_action = world.discrete_action if hasattr(
        world, 'discrete_action') else False
    # if true, every agent gets the same (shared) reward
    self.shared_reward = world.collaborative if hasattr(
        world, 'collaborative') else False
    self.time = 0
    self.agent_num = self.n

    # configure spaces
    obs_shapes = []
    self.action_space = []
    self.observation_space = []
    for agent in self.agents:
        total_action_space = []
        # physical action space
        if self.discrete_action_space:
            u_action_space = spaces.Discrete(world.dim_p * 2 + 1)
        else:
            u_action_space = spaces.Box(low=-agent.u_range,
                                        high=+agent.u_range,
                                        shape=(world.dim_p,),
                                        dtype=np.float32)
        if agent.movable:
            total_action_space.append(u_action_space)
        # communication action space
        if self.discrete_action_space:
            c_action_space = spaces.Discrete(world.dim_c)
        else:
            c_action_space = spaces.Box(low=0.0, high=1.0,
                                        shape=(world.dim_c,),
                                        dtype=np.float32)
        if not agent.silent:
            total_action_space.append(c_action_space)
        # total action space
        if len(total_action_space) > 1:
            # if all sub-spaces are discrete, simplify to a MultiDiscrete
            # action space
            if all(isinstance(act_space, spaces.Discrete)
                   for act_space in total_action_space):
                act_space = MultiDiscrete(
                    [[0, act_space.n - 1] for act_space in total_action_space])
            else:
                act_space = spaces.Tuple(total_action_space)
            self.action_space.append(act_space)
        else:
            self.action_space.append(total_action_space[0])
        # observation space
        obs_dim = len(observation_callback(agent, self.world))
        obs_shapes.append((obs_dim,))
        self.observation_space.append(
            spaces.Box(low=-np.inf, high=+np.inf,
                       shape=(obs_dim,), dtype=np.float32))
        agent.action.c = np.zeros(self.world.dim_c)

    # simplified multi-agent spaces for a non-communication game
    self.action_spaces = MABox(
        lows=[0] * self.agent_num,
        highs=[1] * self.agent_num,
        shapes=[(world.dim_p * 2 + 1,)] * self.agent_num)
    self.observation_spaces = MABox(lows=[-np.inf] * self.agent_num,
                                    highs=[+np.inf] * self.agent_num,
                                    shapes=obs_shapes)
    self.env_specs = MAEnvSpec(self.observation_spaces, self.action_spaces)
    self.action_range = [-10, 10]

    # rendering
    self.shared_viewer = shared_viewer
    if self.shared_viewer:
        self.viewers = [None]
    else:
        self.viewers = [None] * self.n
    self._reset_render()
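
# Typical construction, following the OpenAI multiagent-particle-envs
# pattern (the scenario loader, scenario name, and class name below are
# assumptions based on that library, not part of this excerpt):
#
#   import multiagent.scenarios as scenarios
#   scenario = scenarios.load('simple_spread.py').Scenario()
#   world = scenario.make_world()
#   env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
#                       scenario.observation)
#
# With discrete actions, each movable agent gets Discrete(world.dim_p*2 + 1):
# one no-op plus +/- moves along each of the dim_p physical dimensions.
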
# Matrix (normal-form) game constructor.
def __init__(self, game, agent_num, action_num, payoff=None, repeated=False,
             max_step=25, memory=0, discrete_action=True, tuple_obs=True):
    self.game = game
    self.agent_num = agent_num
    self.action_num = action_num
    self.discrete_action = discrete_action
    self.tuple_obs = tuple_obs

    if self.discrete_action:
        self.action_spaces = MADiscrete([action_num] * self.agent_num)
        if memory == 0:
            self.observation_spaces = MADiscrete([1] * self.agent_num)
        elif memory == 1:
            self.observation_spaces = MADiscrete([5] * self.agent_num)
    else:
        self.action_range = [-1., 1.]
        lows = np.array([np.array([-1.]) for _ in range(self.agent_num)])
        highs = np.array([np.array([1.]) for _ in range(self.agent_num)])
        self.action_spaces = MABox(lows=lows, highs=highs)
        if memory == 0:
            self.observation_spaces = MADiscrete([1] * self.agent_num)
        elif memory == 1:
            lows = np.array(
                [np.array([-1., -1.]) for _ in range(self.agent_num)])
            highs = np.array(
                [np.array([1., 1.]) for _ in range(self.agent_num)])
            self.observation_spaces = MABox(lows=lows, highs=highs)

    self.env_specs = MAEnvSpec(self.observation_spaces, self.action_spaces)
    self.t = 0
    self.repeated = repeated
    self.max_step = max_step
    self.memory = memory
    self.previous_action = 0
    self.previous_actions = []
    self.ep_rewards = np.zeros(2)

    # a custom payoff tensor must have shape (agent_num, action_num, ...)
    if payoff is not None:
        payoff = np.array(payoff)
        assert payoff.shape == tuple([agent_num] + [action_num] * agent_num)
        self.payoff = payoff
    if payoff is None:
        self.payoff = np.zeros(
            tuple([agent_num] + [action_num] * agent_num))

    if game == 'coordination_0_0':
        assert self.agent_num == 2
        assert self.action_num == 2
        self.payoff[0] = [[1, -1], [-1, -1]]
        self.payoff[1] = [[1, -1], [-1, -1]]
    if game == 'coordination_same_action_with_preference':
        assert self.agent_num == 2
        assert self.action_num == 2
        self.payoff[0] = [[2, 0], [0, 1]]
        self.payoff[1] = [[1, 0], [0, 2]]
    # payoff table of a zero-sum game; Nash equilibrium:
    # (agent 1's action = 0, agent 2's action = 1)
    elif game == 'zero_sum_nash_0_1':
        assert self.agent_num == 2
        assert self.action_num == 2
        self.payoff[0] = [[5, 2], [-1, 6]]
        self.payoff[1] = [[-5, -2], [1, -6]]
    # payoff table of a zero-sum game: matching pennies
    elif game == 'matching_pennies':
        assert self.agent_num == 2
        assert self.action_num == 2
        self.payoff[0] = [[1, -1], [-1, 1]]
        self.payoff[1] = [[-1, 1], [1, -1]]
    # elif game == 'matching_pennies_3':
    #     assert self.agent_num == 3
    #     assert self.action_num == 2
    #     self.payoff[0] = [[[1, -1], [-1, 1]],
    #                       [[1, -1], [-1, 1]]]
    #     self.payoff[1] = [[[1, -1], [1, -1]],
    #                       [[-1, 1], [-1, 1]]]
    #     self.payoff[2] = [[[-1, -1], [1, 1]],
    #                       [[1, 1], [-1, -1]]]
    elif game == 'prison_lola':
        assert self.agent_num == 2
        assert self.action_num == 2
        self.payoff[0] = [[-1, -3], [0, -2]]
        self.payoff[1] = [[-1, 0], [-3, -2]]
    elif game == 'prison':
        assert self.agent_num == 2
        assert self.action_num == 2
        self.payoff[0] = [[3, 1], [4, 2]]
        self.payoff[1] = [[3, 4], [1, 2]]
    elif game == 'stag_hunt':
        assert self.agent_num == 2
        assert self.action_num == 2
        self.payoff[0] = [[4, 1], [3, 2]]
        self.payoff[1] = [[4, 3], [1, 2]]
    elif game == 'chicken':  # a.k.a. snowdrift
        assert self.agent_num == 2
        assert self.action_num == 2
        self.payoff[0] = [[3, 2], [4, 1]]
        self.payoff[1] = [[3, 4], [2, 1]]
    elif game == 'harmony':
        assert self.agent_num == 2
        assert self.action_num == 2
        self.payoff[0] = [[4, 3], [2, 1]]
        self.payoff[1] = [[4, 2], [3, 1]]
    elif game == 'wolf_05_05':
        assert self.agent_num == 2
        assert self.action_num == 2
        self.payoff[0] = [[0, 3], [1, 2]]
        self.payoff[1] = [[3, 2], [0, 1]]
        # with alpha, beta = 0, 0.9 the mixed Nash equilibrium is
        # (0.5, 0.5); given the Q tables, maintaining best responses
        # learns a Nash equilibrium
    elif game == 'climbing':
        assert self.agent_num == 2
        assert self.action_num == 3
        self.payoff[0] = [[11, -30, 0], [-30, 7, 6], [0, 0, 5]]
        self.payoff[1] = [[11, -30, 0], [-30, 7, 6], [0, 0, 5]]
    elif game == 'penalty':
        assert self.agent_num == 2
        assert self.action_num == 3
        self.payoff[0] = [[10, 0, 0], [0, 2, 0], [0, 0, 10]]
        self.payoff[1] = [[10, 0, 0], [0, 2, 0], [0, 0, 10]]
    # elif game == 'rock_paper_scissors':
    #     assert self.agent_num == 2
    #     assert self.action_num == 3
    #     self.payoff[0] = [[0, -1, 1], [1, 0, -1], [-1, 1, 0]]
    #     self.payoff[1] = [[0, 1, -1], [-1, 0, 1], [1, -1, 0]]

    self.rewards = np.zeros((self.agent_num,))
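
# For mixed strategies over one of the tables above, the expected payoff is
# the bilinear form p1^T A p2. A minimal sketch (the helper name is
# illustrative, not part of the class):
def _expected_payoff(table, p1, p2):
    return np.asarray(p1) @ np.asarray(table, dtype=float) @ np.asarray(p2)

# matching pennies: at the mixed Nash (0.5, 0.5) both agents expect 0
pennies_0 = [[1, -1], [-1, 1]]
assert _expected_payoff(pennies_0, [0.5, 0.5], [0.5, 0.5]) == 0.0
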