class MyEnv(gym.Env):
    metadata = {'render.modes': ['human']}
    action_space = MultiDiscrete([3, 2, 4])
    state_space = MultiDiscrete([3, 2, 4])

    def __init__(self):
        pass

    def step(self, action):
        # Gym expects the info field to be a dict rather than None.
        return self.state_space.sample(), 0, False, {}

    def reset(self):
        return self.state_space.sample()

    def render(self, mode='human'):
        pass
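# Usage sketch (not part of the original source; assumes the `gym` and `MultiDiscrete`
# imports used by MyEnv above are in scope). It shows that sampling the MultiDiscrete
# action space yields one integer per component, with values in [0, 3), [0, 2), [0, 4).
env = MyEnv()
obs = env.reset()
for _ in range(5):
    action = env.action_space.sample()          # e.g. array([2, 0, 3])
    obs, reward, done, info = env.step(action)  # random next observation, reward 0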
def convert_to_gym_action_spaces(self):
    vector = [0] * (1 + len(self._parameter_registry))
    vector[0] = self._num_actions
    for arg_type in self._parameter_registry:
        vector[1 + self._parameter_registry[arg_type]] = \
            retrieve_parameter_size_vector(arg_type,
                                           self._feature_screen_size,
                                           self._feature_minimap_size)
    # print(vector)
    return MultiDiscrete(vector)
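# Illustration only (the sizes below are hypothetical, not from the original code): the
# vector built above holds the number of discrete actions at index 0 and one entry per
# registered parameter type after it, so sampling returns one integer choice per slot.
from gym.spaces import MultiDiscrete

space = MultiDiscrete([12, 64, 64])   # e.g. 12 actions plus two 64-valued parameters
print(space.sample())                 # e.g. [ 7 41  3]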
class IntelligentRandomTrainerAgent(Agent):
    key_list = Config.load_json(file_path=CONFIG_KEY + '/intelligentRandomTrainerAgentKey.json')

    def __init__(self, config, model, env):
        super(IntelligentRandomTrainerAgent, self).__init__(config=config, model=model, env=env)
        self.sample_count = 0
        self.sess = tf.get_default_session()
        # Legacy gym API: MultiDiscrete took a list of [min, max] ranges per dimension.
        self.action_space = MultiDiscrete([[0, 1], [0, 1], [0, 1]])

    def predict(self, state, *args, **kwargs):
        res = self.action_space.sample()
        # Each sampled component is overwritten with 1.0 (p <= 0.5) or 0.2,
        # so the returned action is continuous-valued despite the discrete space.
        for i in range(3):
            prob = np.random.rand(1)
            if prob <= 0.5:
                res[i] = 1.0
            else:
                res[i] = 0.2
        self.sample_count += 1
        return np.array(res)

    def update(self):
        # TODO finish your own update by using API with self.model
        pass
        # self.model.update()

    def store_one_sample(self, state, next_state, action, reward, done, *arg, **kwargs):
        # TODO store the one sample to whatever you want
        # self.model.store_one_sample(state=state,
        #                             next_state=next_state,
        #                             action=action,
        #                             reward=reward,
        #                             done=done)
        self.log_file_content.append({
            'STATE': np.array(state).tolist(),
            'NEW_STATE': np.array(next_state).tolist(),
            'ACTION': np.array(action).tolist(),
            'REWARD': reward,
            'DONE': done,
            'INDEX': self.log_print_count
        })
        self.log_print_count += 1

    def init(self):
        # TODO init your agent and your model
        # this function will be called at the start of the whole train process
        # self.model.init()
        pass
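# Note (added for context, not from the original code): in current gym/gymnasium
# releases MultiDiscrete takes a vector of per-dimension counts (nvec) rather than
# [min, max] pairs, so the modern equivalent of the legacy space above would be:
from gym.spaces import MultiDiscrete

action_space = MultiDiscrete([2, 2, 2])  # three binary components, each in {0, 1}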
def __init__(self, low_s, high_s, low_a, high_a, disc_s=False, disc_a=False):
    self.low_s, self.high_s = low_s, high_s
    self.low_a, self.high_a = low_a, high_a
    self.d_s, self.d_a = len(low_s), len(low_a)
    self.disc_s, self.disc_a = disc_s, disc_a
    self.n_a = int(high_a[0]) if disc_a else -1
    self.n_s = int(high_s[0]) if disc_s else -1
    # MultiDiscrete treats high_s as a vector of per-dimension counts,
    # so valid observations range over 0 .. high_s[i] - 1 in each dimension.
    self.observation_space = MultiDiscrete(high_s)
    self.action_space = Discrete(self.n_a)
def __init__(self, nbombs: int = 10, shape: Tuple[int, int] = (8, 8)):
    assert min(shape) > 0 and nbombs > 0
    assert shape[0] * shape[1] > nbombs + 8
    self.observation_space = Box(low=-1, high=8, shape=shape, dtype=np.int8)
    self.action_space = MultiDiscrete(shape)
    self.nbombs = nbombs
    self.shape = shape
    self._bombs = None
    self._nnbombs = None
    self._layout = None  # type: Optional[np.ndarray]
    self._initialized = False
    self.done = False  # Set to False at reset()
    self.reset()
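# Usage sketch (the class name MinesweeperEnv is a placeholder, not taken from the
# source): with action_space = MultiDiscrete(shape), every action is a (row, col) pair
# indexing a board cell, so a random agent simply samples a cell each step.
env = MinesweeperEnv(nbombs=10, shape=(8, 8))
action = env.action_space.sample()   # e.g. array([3, 6]) -> row 3, column 6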
def action_space(self):
    """
    return: Tuple(Box, MultiDiscrete)
        0: direction -1
        1: direction 0
        2: direction 1
    """
    max_decel = self.env_params.additional_params["max_decel"]
    max_accel = self.env_params.additional_params["max_accel"]

    lb = [-abs(max_decel)] * self.initial_vehicles.num_rl_vehicles  # lower bound of acceleration
    ub = [max_accel] * self.initial_vehicles.num_rl_vehicles  # upper bound of acceleration
    lc_b = [3] * self.initial_vehicles.num_rl_vehicles  # boundary of lane change direction

    return Tuple((Box(np.array(lb), np.array(ub), dtype=np.float32),
                  MultiDiscrete(np.array(lc_b))))
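# Standalone illustration (made-up bounds, not from the original env): a sample from
# such a Tuple space is a pair (accelerations, lane_change_codes), and each lane-change
# code in {0, 1, 2} maps to a direction in {-1, 0, 1} as described in the docstring.
import numpy as np
from gym.spaces import Box, MultiDiscrete, Tuple

num_rl = 2
space = Tuple((Box(np.array([-3.0] * num_rl), np.array([3.0] * num_rl), dtype=np.float32),
               MultiDiscrete([3] * num_rl)))
accel, lane_change = space.sample()  # e.g. (array([ 1.2, -0.7], dtype=float32), array([2, 0]))
direction = lane_change - 1          # map codes {0, 1, 2} to directions {-1, 0, 1}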
def _generate_sample_space(self, base_space: Union[None, Box, Discrete],
                           num: int) -> Optional[Union[Box, Discrete]]:
    # handle the possibility of this space having nothing
    if num == 0:
        return None

    if isinstance(base_space, Box):
        return Box(
            low=np.array(max(1, num) * [base_space.low]),
            high=np.array(max(1, num) * [base_space.high]),
            shape=(num, *base_space.shape),
            dtype=base_space.dtype,
            seed=self._np_random,
        )
    elif isinstance(base_space, Discrete):
        return MultiDiscrete(nvec=[base_space.n] * num, seed=self._np_random)
    elif base_space is None:
        return None
    else:
        raise AssertionError(
            f"Only Box and Discrete can be accepted as a base_space, got {type(base_space)}, "
            "you should not have gotten this error."
        )
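# Illustration only (names and counts are mine, not from the source): replicating a
# Discrete base space num times yields a MultiDiscrete with one identical count per
# copy, while a Box base space is stacked along a new leading axis of length num.
from gym.spaces import Discrete, MultiDiscrete

batched = MultiDiscrete(nvec=[Discrete(5).n] * 3)   # stands in for 3 copies of Discrete(5)
print(batched.sample())                             # e.g. [4 0 2]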
class BlockFLEnv(gym.Env):
    def __init__(self, nb_devices=3, d_max=4, e_max=4, u_max=4, f_max=3, c_max=3, m_max=10, parameters=None):
        self.parameters = parameters
        # `local_parameters` is assumed to be a module-level dict in the original code.
        self.parameters['latency_threshold'] = local_parameters['latency_threshold']
        self.nb_devices = nb_devices
        self.d_max = d_max
        self.e_max = e_max
        self.u_max = u_max
        self.f_max = f_max
        self.c_max = c_max
        self.m_max = m_max
        # One data entry and one energy entry per device, plus a single mining-rate entry.
        self.action_space = MultiDiscrete(np.array([self.d_max, self.e_max, self.u_max])
                                          .repeat(nb_devices)[:2*self.nb_devices+1])
        low_box = np.array([0, 0, 1]).repeat(self.nb_devices)[:2*self.nb_devices+1]
        high_box = np.array([self.f_max, self.c_max, self.m_max]).repeat(self.nb_devices)[:2*self.nb_devices+1]
        self.observation_space = Box(low=low_box, high=high_box, dtype=np.int32)
        # self.state = self.observation_space.sample()
        self.accumulate_data = np.zeros(self.nb_devices)
        self.penalties = 0
        self.logger = {
            'episode_reward': [],
            'episode_steps': 0,
            'epsilon': 0,
            'average_reward': 0,
            'energy': [],
            'latency': [],
            'payment': [],
            'cumulative_data': np.zeros(self.nb_devices),
            'actions': [],
            'states': [],
            'data_required': [],
            'energy_required': [],
            'latency_required': [],
            'payment_required': [],
        }
        self.seed(123)
        self.reset()

    def get_penalties(self, scale):
        return self.penalties * scale

    def check_action(self, action):
        self.penalties = 0
        state = np.copy(self.state)
        capacity_array = np.copy(state[self.nb_devices:2*self.nb_devices])
        data_action_array = np.copy(action[0:self.nb_devices])
        energy_action_array = np.copy(action[self.nb_devices:2*self.nb_devices])
        mining_rate_array = np.full(self.nb_devices, self.parameters['mining_rate_zero'] + action[-1], dtype=int)
        cpu_cycles = self.get_cpu_cycles(energy_action_array, data_action_array)
        for i in range(len(energy_action_array)):
            if energy_action_array[i] > capacity_array[i]:
                # energy_action_array[i] = capacity_array[i]
                energy_action_array[i] = 0
                self.penalties += 0
        for j in range(len(cpu_cycles)):
            if cpu_cycles[j] == 0:
                data_action_array[j] = 0
                energy_action_array[j] = 0
                self.penalties += 0
        corrected_action = np.array([data_action_array, energy_action_array,
                                     mining_rate_array]).flatten()[:2*self.nb_devices+1]
        return corrected_action

    def get_cpu_cycles(self, energy, data):
        cpu_cycles = np.zeros(len(energy))
        cpu_cycles_max = self.parameters['sigma'] * self.state[:self.nb_devices]
        for i in range(len(data)):
            if data[i] != 0 and energy[i] != 0:
                cpu_cycles[i] = np.sqrt(self.parameters['delta'] * energy[i]
                                        / (self.parameters['tau'] * self.parameters['nu'] * data[i]))
                if cpu_cycles[i] > cpu_cycles_max[i]:
                    cpu_cycles[i] = 0
            else:
                cpu_cycles[i] = 0
        # print(cpu_cycles)
        return cpu_cycles

    def calculate_latency(self, action):
        data = np.copy(action[:self.nb_devices])
        energy = np.copy(action[self.nb_devices:2 * self.nb_devices])
        mining_rate = self.parameters['mining_rate_zero'] + action[-1]
        cpu_cycles = self.get_cpu_cycles(energy, data)
        training_latency = np.max([self.parameters['nu'] * data[k] / cpu_cycles[k] if cpu_cycles[k] != 0 else 0
                                   for k in range(len(data))])
        block_queue_latency = self.parameters['cross_verify_latency'] + self.parameters['block_prop_latency'] + \
            self.parameters['blk_latency_scale'] * self.nprandom.exponential(1 / (mining_rate - self.parameters['block_arrival_rate']))
        latency = self.parameters['transmission_latency'] + block_queue_latency + \
            self.parameters['training_latency_scale'] * training_latency
        # print('L_tr: {}, L_tx: {}, L_blk: {}'.format(training_latency, parameters['transmission_latency'],
        #                                              block_queue_latency))
        return latency

    def get_reward(self, action):
        data = np.copy(action[:self.nb_devices])
        energy = np.copy(action[self.nb_devices:2 * self.nb_devices])
        cumulative_data = np.sum([self.parameters['data_qualities'][k] * data[k] for k in range(self.nb_devices)])
        payment = self.parameters['training_price'] * cumulative_data + self.parameters['blk_price'] / np.log(1 + self.state[-1])
        latency = self.calculate_latency(action)
        penalties = self.get_penalties(self.parameters['penalty_scale'])
        reward = self.parameters['alpha_D'] * cumulative_data / self.parameters['data_threshold'] \
            - self.parameters['alpha_E'] * np.sum(energy) / self.parameters['energy_threshold'] \
            - self.parameters['alpha_L'] * latency / self.parameters['latency_threshold'] \
            - self.parameters['alpha_I'] * payment / self.parameters['payment_threshold'] \
            - penalties
        if payment / self.parameters['payment_threshold'] > 1:
            print('data: {}, energy: {}, latency: {}, payment: {}'.format(
                cumulative_data / self.parameters['data_threshold'],
                np.sum(energy) / self.parameters['energy_threshold'],
                latency / self.parameters['latency_threshold'],
                payment / self.parameters['payment_threshold']))
        self.logger['latency'].append(latency)
        self.logger['energy'].append(np.sum(energy))
        self.logger['payment'].append(payment)
        self.logger['cumulative_data'] = np.add(self.logger['cumulative_data'], data)
        return reward, cumulative_data / self.parameters['data_threshold'], np.sum(energy) / self.parameters['energy_threshold'], \
            latency / self.parameters['latency_threshold'], payment / self.parameters['payment_threshold']

    def state_transition(self, state, action):
        capacity_array = np.copy(state[self.nb_devices:2*self.nb_devices])
        energy_array = np.copy(action[self.nb_devices:2*self.nb_devices])
        mining_rate = self.parameters['mining_rate_zero'] + action[-1]
        charging_array = self.nprandom.poisson(1, size=len(energy_array))
        cpu_shares_array = self.nprandom.randint(low=0, high=self.f_max+1, size=self.nb_devices)
        next_capacity_array = np.zeros(len(capacity_array))
        block_queue_state = self.nprandom.geometric(1 - self.parameters['lambda'] / mining_rate, size=self.nb_devices)
        for i in range(len(next_capacity_array)):
            next_capacity_array[i] = min(capacity_array[i] - energy_array[i] + charging_array[i], self.c_max)
        next_state = np.array([cpu_shares_array, next_capacity_array, block_queue_state], dtype=np.int32).flatten()
        self.state = next_state[:1+2*self.nb_devices]
        return self.state

    def step(self, action):
        assert self.action_space.contains(action), "%r (%s) invalid" % (action, type(action))
        corrected_action = self.check_action(action)
        # corrected_action = action
        data = np.copy(corrected_action[:self.nb_devices])
        state = np.copy(self.state)
        next_state = self.state_transition(state, corrected_action)
        reward = self.get_reward(corrected_action)
        self.accumulate_data = np.add(self.accumulate_data, data)
        # print(self.get_reward(corrected_action)[1:])
        self.logger['episode_steps'] += 1
        self.logger['episode_reward'].append(reward[0])
        self.logger['actions'].append(action)
        self.logger['states'].append(state)
        self.logger['data_required'].append(reward[1])
        self.logger['energy_required'].append(reward[2])
        self.logger['latency_required'].append(reward[3])
        self.logger['payment_required'].append(reward[4])
        if np.sum(self.accumulate_data) >= self.parameters['cumulative_data_threshold']:
            done = True
            self.logger['average_reward'] = np.mean(self.logger['episode_reward'])
        else:
            done = False
        # self.state = next_state
        return next_state, reward[0], done, {}

    def reset(self):
        self.accumulate_data = np.zeros(self.nb_devices)
        self.penalties = 0
        self.logger = {
            'episode_reward': [],
            'episode_steps': 0,
            'epsilon': 0,
            'average_reward': 0,
            'energy': [],
            'latency': [],
            'payment': [],
            'cumulative_data': np.zeros(self.nb_devices),
            'actions': [],
            'states': [],
            'data_required': [],
            'energy_required': [],
            'latency_required': [],
            'payment_required': [],
        }
        cpu_shares_init = self.nprandom.randint(self.f_max + 1, size=self.nb_devices)
        capacity_init = self.nprandom.randint(self.c_max + 1, size=self.nb_devices)
        mempool_init = np.full(self.nb_devices, 1)
        state = np.array([cpu_shares_init, capacity_init, mempool_init]).flatten()
        state = state[:2 * self.nb_devices + 1]
        # state = self.observation_space.sample()
        self.state = state
        return state

    def seed(self, seed=None):
        self.nprandom, seed = seeding.np_random(seed)
        return [seed]
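# Sketch of the resulting action space (values taken from the constructor defaults,
# built standalone here rather than through BlockFLEnv): with nb_devices=3 and
# d_max=e_max=u_max=4, the space is MultiDiscrete([4]*7), where indices 0-2 choose the
# data units per device, indices 3-5 the energy units, and the last entry the mining rate.
import numpy as np
from gym.spaces import MultiDiscrete

nb_devices = 3
space = MultiDiscrete(np.array([4, 4, 4]).repeat(nb_devices)[:2 * nb_devices + 1])
print(space.nvec)      # [4 4 4 4 4 4 4]
print(space.sample())  # e.g. [1 3 0 2 2 1 3]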