class MultiAgentsEnv(MultiAgentEnv):
    def __init__(self, env_config):
        self.env = MinerEnv(None, None)
        self.env.start()
        self.state = self.env.state
        self.width = 21
        self.height = 9
        self.action_space = Discrete(6)
        self.observation_space = Tuple((
            Box(low=0, high=np.inf, shape=(self.width, self.height, 1)),
            Box(low=-np.inf, high=np.inf, shape=(4,)),
            Box(low=-2, high=1, shape=(4,)),
        ))

    def reset(self):
        map_id = np.random.randint(1, 7)
        pos_x = np.random.randint(self.width)
        pos_y = np.random.randint(self.height)
        number_of_players = np.random.randint(1, 5)
        self.env.send_map_info(map_id, pos_x, pos_y, number_of_players=number_of_players)
        self.env.reset()
        ids = list(range(2, 1 + number_of_players))
        self.bots = []
        if number_of_players > 1:
            for _ in range(np.random.randint(1, number_of_players)):
                # 2/3 chance of a Bot1 opponent, 1/3 chance of a Bot2.
                if random.choice([1, 1, 2]) == 1:
                    self.bots.append(Bot1(ids.pop(random.choice(range(len(ids))))))
                else:
                    self.bots.append(Bot2(ids.pop(random.choice(range(len(ids)))), gamma=random.choice([1.0])))
        return self.get_state()

    def step(self, action):
        # Let each scripted bot pick its own action before advancing the game.
        for bot in self.bots:
            action[str(bot.id)] = bot.compute_action(self.state)
        self.env.step(action)
        return self.get_state(), self.get_reward(), self.get_done(), {}

    def get_state(self):
        # Building the map: obstacles as type codes, gold as bucketed amounts.
        view = np.zeros([self.width, self.height, 1], dtype=float)
        for obstacle in self.state.mapInfo.obstacles:
            obstacle_type = obstacle['type']
            x = obstacle['posx']
            y = obstacle['posy']
            value = obstacle['value']
            if obstacle_type == 3:
                # Swamps (type 3) are split into four codes by energy penalty.
                if value == -5:
                    obstacle_type = 4
                elif value == -20:
                    obstacle_type = 5
                elif value == -40:
                    obstacle_type = 6
                elif value == -100:
                    obstacle_type = 7
                else:
                    raise Exception('No such obstacle')
            view[x, y, 0] = obstacle_type
        for gold in self.state.mapInfo.golds:
            gold_amount = gold['amount']
            x = gold['posx']
            y = gold['posy']
            if gold_amount > 0:
                view[x, y, 0] = min(7 + math.ceil(gold_amount / 50), 37)
        return {
            str(player_id): self.get_single_player_state(np.copy(view), player_id)
            for player_id in self.state.players.keys()
        }

    def get_single_player_state(self, view, playerId):
        players_pos = np.full(4, -1, dtype=int)
        energies = np.zeros(4)
        i = 1
        for player_id, player_state in self.state.players.items():
            x = player_state['posx']
            y = player_state['posy']
            if x < view.shape[0] and y < view.shape[1]:
                if player_id == playerId:
                    players_pos[0] = x * self.height + y
                    energies[0] = player_state['energy'] / 50
                else:
                    players_pos[i] = x * self.height + y
                    energies[i] = player_state['energy'] / 50
                    i += 1
        return (
            view,
            players_pos,
            energies,
        )

    def get_reward(self):
        return {
            str(player_id): self.get_single_player_reward(player_id)
            for player_id in self.state.players.keys()
        }

    def get_single_player_reward(self, playerId):
        # Calculate reward
        reward = 0
        player = self.state.players[playerId]
        player_pre = self.state.players_pre[playerId]
        score_action = player['score'] - player_pre['score']
        if score_action > 0:
            reward += score_action / 50
        consumed_energy = player_pre['energy'] - player['energy']
        # Penalize a wasted craft (10 energy spent without gaining score).
        if Action(player['lastAction']) == Action.CRAFT and consumed_energy == 10:
            reward += -1.0
        if player['status'] == self.state.STATUS_ELIMINATED_OUT_OF_ENERGY:
            reward += -1.0
        # Penalize resting while already at full energy.
        if Action(player['lastAction']) == Action.FREE and player_pre['energy'] == 50:
            reward += -0.1
        return reward

    def get_done(self):
        done = {'__all__': False}
        if all(map(lambda player_state: player_state['status'] != 0, self.state.players.values())):
            done['__all__'] = True
        return done
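# A minimal sketch of training MultiAgentsEnv with RLlib's PPO. The env name
# "miner_multi" and the config values are illustrative assumptions, not taken
# from this repository; register_env and tune.run are standard Ray APIs.
import ray
from ray import tune
from ray.tune.registry import register_env

register_env("miner_multi", lambda env_config: MultiAgentsEnv(env_config))

ray.init()
tune.run(
    "PPO",
    stop={"training_iteration": 10},
    config={
        "env": "miner_multi",
        "num_workers": 1,
        # Without an explicit "multiagent" policies dict, RLlib maps every
        # agent id returned by get_state()/get_reward() to one shared policy.
    },
)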
# Create header for saving learning file
"""
now = datetime.datetime.now()  # Getting the latest datetime
header = ["Ep", "Step", "Reward", "Total_reward", "Action", "Epsilon", "Done", "Termination_Code"]  # Defining the header for the save file
filename = "Data/data_" + now.strftime("%Y%m%d-%H%M") + ".csv"
with open(filename, 'w') as f:
    pd.DataFrame(columns=header).to_csv(f, encoding='utf-8', index=False, header=True)
"""

# Initialize environment
HOST = "localhost"
PORT = 1111
if len(sys.argv) == 3:
    HOST = str(sys.argv[1])
    PORT = int(sys.argv[2])

minerEnv = MinerEnv(HOST, PORT)
minerEnv.start()

# train = False  # The variable is used to indicate that the epsilon starts to decrease.

# Training process
if __name__ == '__main__':
    gnet = Net(INPUTNUM, ACTIONNUM)  # global network
    gnet.share_memory()  # share the global parameters in multiprocessing
    opt = SharedAdam(gnet.parameters(), lr=1e-4, betas=(0.95, 0.999))  # global optimizer
    global_ep, global_ep_r, res_queue = mp.Value('i', 0), mp.Value('d', 0.), mp.Queue()
    # parallel training
    workers = [
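    # The snippet above cuts off at the worker list. In the common PyTorch A3C
    # pattern this setup follows (global Net + SharedAdam + mp.Value/mp.Queue),
    # the continuation typically looks like the commented sketch below; the
    # Worker class name and constructor signature are assumptions, not taken
    # from this file.
    # workers = [Worker(gnet, opt, global_ep, global_ep_r, res_queue, i)
    #            for i in range(mp.cpu_count())]
    # [w.start() for w in workers]
    # results = []  # episode rewards, for plotting
    # while True:
    #     r = res_queue.get()
    #     if r is None:
    #         break
    #     results.append(r)
    # [w.join() for w in workers]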
BATCH_SIZE = 32  # The number of experiences sampled for each replay step
MEMORY_SIZE = 100000  # The capacity of the replay memory for storing experiences
SAVE_NETWORK = 100  # After this number of episodes, the DQN model is saved for testing later.
INITIAL_REPLAY_SIZE = 1000  # The number of experiences stored in the memory before replay starts
INPUTNUM = 198  # The number of input values for the DQN model
ACTIONNUM = 6  # The number of actions output from the DQN model
MAP_MAX_X = 21  # Width of the map
MAP_MAX_Y = 9  # Height of the map

# Initialize a DQN model and a replay memory for storing experiences
DQNAgent = DQN(INPUTNUM, ACTIONNUM)
memory = Memory(MEMORY_SIZE)

# Initialize environment
minerEnv = MinerEnv(HOST, PORT)  # Creating a communication environment between the DQN model and the game environment (GAME_SOCKET_DUMMY.py)
minerEnv.start()  # Connect to the game

train = False  # Indicates that replay has started and epsilon starts to decrease.

# Training process: the main part of the deep Q-learning algorithm
for episode_i in range(0, N_EPISODE):
    try:
        # Choosing a map in the list
        mapID = np.random.randint(1, 6)  # Choosing a map ID from the 5 maps in the Maps folder randomly
        posID_x = np.random.randint(MAP_MAX_X)  # Choosing an initial X position for the DQN agent randomly
        posID_y = np.random.randint(MAP_MAX_Y)  # Choosing an initial Y position for the DQN agent randomly
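        # A sketch of the replay update these constants drive. The Memory/DQN
        # method names used here (push, sample, replay, update_epsilon) are
        # assumptions for illustration; the real interfaces live in this
        # repo's Memory and DQN classes.
        #
        # memory.push(s, a, r, s_next, terminate)  # store one experience
        # if memory.length > INITIAL_REPLAY_SIZE:
        #     batch = memory.sample(BATCH_SIZE)    # uniform random minibatch
        #     DQNAgent.replay(batch, BATCH_SIZE)   # one gradient step
        #     train = True                         # epsilon decays from here on
        # if train:
        #     DQNAgent.update_epsilon()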
policy = TD3_conv.TD3(**kwargs)
policy_file = "TD3_conv_Miner_0_2_scale3"
policy.load(f"./models_TD3_tensor/{policy_file}")
print("Loaded model from disk")

status_map = {
    0: "STATUS_PLAYING",
    1: "STATUS_ELIMINATED_WENT_OUT_MAP",
    2: "STATUS_ELIMINATED_OUT_OF_ENERGY",
    3: "STATUS_ELIMINATED_INVALID_ACTION",
    4: "STATUS_STOP_EMPTY_GOLD",
    5: "STATUS_STOP_END_STEP"
}

total_reward = 0
try:
    # Initialize environment
    minerEnv = MinerEnv(HOST, PORT)
    minerEnv.start()  # Connect to the game

    mapID = np.random.randint(1, 6)  # Choosing a map ID from the 5 maps in the Maps folder randomly
    posID_x = np.random.randint(MAP_MAX_X)  # Choosing an initial X position for the agent randomly
    posID_y = np.random.randint(MAP_MAX_Y)  # Choosing an initial Y position for the agent randomly

    # Creating a request for initializing the map, initial position, initial
    # energy, and the maximum number of steps of the agent
    request = "map" + str(mapID) + "," + str(posID_x) + "," + str(posID_y) + ",50,100"
    # Send the request to the game environment (GAME_SOCKET_DUMMY.py)
    minerEnv.send_map_info(request)
    minerEnv.reset()
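    # The request string above encodes five fields: map name, start x, start y,
    # initial energy (50), and the maximum number of steps (100). A tiny helper
    # making that format explicit (hypothetical name, not from this repo):
    def build_map_request(map_id, pos_x, pos_y, energy=50, max_steps=100):
        return f"map{map_id},{pos_x},{pos_y},{energy},{max_steps}"

    # build_map_request(3, 10, 4) -> "map3,10,4,50,100"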
json_file.close()
DQNAgent = model_from_json(loaded_model_json)
# Load weights into the new model
DQNAgent.load_weights("DQNmodel_Test.h5")
print("Loaded model from disk")

status_map = {
    0: "STATUS_PLAYING",
    1: "STATUS_ELIMINATED_WENT_OUT_MAP",
    2: "STATUS_ELIMINATED_OUT_OF_ENERGY",
    3: "STATUS_ELIMINATED_INVALID_ACTION",
    4: "STATUS_STOP_EMPTY_GOLD",
    5: "STATUS_STOP_END_STEP"
}

try:
    # Initialize environment
    minerEnv = MinerEnv(HOST, PORT)
    minerEnv.start()  # Connect to the game
    minerEnv.reset()
    state_map, state_users = minerEnv.get_state(initial_flag=True)  # Getting an initial state

    while not minerEnv.check_terminate():
        try:
            # Heuristic override: when standing on gold, rest if energy is low,
            # otherwise craft; fall back to the model everywhere else.
            if minerEnv.state.mapInfo.gold_amount(minerEnv.state.x, minerEnv.state.y) > 0:
                if minerEnv.state.energy <= 5:
                    action = 4  # rest to recover energy
                else:
                    action = 5  # craft (mine the gold)
            else:
                action = np.argmax(DQNAgent.predict({
class MinerGymEnv(gym.Env):
    def __init__(self, HOST, PORT, debug=False):
        self.minerEnv = MinerEnv(HOST, PORT)
        self.minerEnv.start()
        self.action_space = spaces.Discrete(6)
        self.observation_space = spaces.Discrete(198)
        self.action = None
        self.reward = None
        self.ob = None
        self.view = None
        self.state = self.minerEnv.state
        self.maxstep = self.minerEnv.state.mapInfo.maxStep
        self.img_array = []

    def step(self, action):
        self.minerEnv.step(str(action))
        reward = self.get_reward()
        ob = self.get_state()
        episode_over = self.check_terminate()
        self.ob = ob
        self.action = action
        self.reward = reward
        return ob, reward, episode_over, {
            'score': self.minerEnv.state.score,
            'action': action
        }

    def render(self, mode='human'):
        img = cv2.imread("/content/map2.png")
        for player in self.minerEnv.state.players:
            if player['playerId'] in [1, 2]:
                id = player['playerId']
                score = player['score']
                energy = player['energy']
                free_count = player['freeCount']
                last_action = ACTIONS[player['lastAction']]
                # last_action = ACTIONS[self.action]
                status = player['status']
                x = player['posx']
                y = player['posy']
                if x >= 21 or y >= 9:
                    continue
                # Each map cell is 71 px wide; offset to the cell centre.
                pos_img = (36 + x * 71, 36 + y * 71)
                cv2.circle(img, pos_img, 16, COLORS_ID[id], -1)
        self.img_array.append(img)

    def reset(self):
        mapID = 1
        posID_x = np.random.randint(21)
        posID_y = np.random.randint(9)
        request = "map" + str(mapID) + "," + str(posID_x) + "," + str(posID_y) + ",50,100"
        self.minerEnv.send_map_info(request)
        self.minerEnv.reset()
        state = self.get_state()
        return state

    def check_terminate(self):
        return self.minerEnv.check_terminate()

    def get_reward(self):
        return self.minerEnv.get_reward()

    def get_state(self):
        # Build a 2D view of the map: obstacles as negative IDs, gold as its amount.
        view = np.zeros([self.state.mapInfo.max_x + 1, self.state.mapInfo.max_y + 1], dtype=int)
        for i in range(self.state.mapInfo.max_x + 1):
            for j in range(self.state.mapInfo.max_y + 1):
                if self.state.mapInfo.get_obstacle(i, j) == TreeID:  # Tree
                    view[i, j] = -TreeID
                if self.state.mapInfo.get_obstacle(i, j) == TrapID:  # Trap
                    view[i, j] = -TrapID
                if self.state.mapInfo.get_obstacle(i, j) == SwampID:  # Swamp
                    view[i, j] = -SwampID
                if self.state.mapInfo.gold_amount(i, j) > 0:
                    view[i, j] = self.state.mapInfo.gold_amount(i, j)
        self.view = view
        return self.minerEnv.get_state()

    def close(self):
        self.minerEnv.end()

    def start(self):
        self.minerEnv.start()
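# A minimal driver loop for the MinerGymEnv above, assuming the game socket is
# reachable at localhost:1111 (both values are placeholders):
env = MinerGymEnv("localhost", 1111)
ob = env.reset()
done = False
while not done:
    act = env.action_space.sample()  # random policy, just to exercise the env
    ob, reward, done, info = env.step(act)
env.close()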
policy = newDDPG.DDPG(**kwargs)
if args.policy == "TD3_conv":
    policy = TD3_conv.TD3(**kwargs)
if args.policy == "A2C":
    policy = A2C.A2C(**kwargs)

if args.load_model != "":
    policy_file = file_name if args.load_model == "default" else args.load_model
    policy.load(f"./models/{policy_file}")

replay_buffer = utils.ReplayBuffer(state_dim, action_dim=action_dim, max_size=int(10000))

# Initialize environment
minerEnv = MinerEnv(HOST, PORT)
minerEnv.start()  # init environment

# Evaluate untrained policy
# evaluations = [eval_policy(policy, minerEnv)]

train = False
best_score = {1: 0, 2: 0, 3: 0, 4: 0}
for episode_i in range(0, N_EPISODE):
    # Reset environment
    mapID = request_to_env(minerEnv, train)  # init environment game
    minerEnv.reset()
    # action = policy.select_action(np.array(state))
    state = minerEnv.get_state_tensor2(scale_map)
    done = False
json_file.close()
DQNAgent = model_from_json(loaded_model_json)
# Load weights into the new model
DQNAgent.load_weights("RLModelSample.h5")
print("Loaded model from disk")

status_map = {
    0: "STATUS_PLAYING",
    1: "STATUS_ELIMINATED_WENT_OUT_MAP",
    2: "STATUS_ELIMINATED_OUT_OF_ENERGY",
    3: "STATUS_ELIMINATED_INVALID_ACTION",
    4: "STATUS_STOP_EMPTY_GOLD",
    5: "STATUS_STOP_END_STEP"
}

try:
    # Initialize environment
    minerEnv = MinerEnv(HOST, PORT)
    minerEnv.start()  # Connect to the game
    minerEnv.reset()
    s = minerEnv.get_state()  # Getting an initial state
    while not minerEnv.check_terminate():
        try:
            action = np.argmax(DQNAgent.predict(s.reshape(1, len(s))))  # Getting an action from the trained model
            print("next action = ", action)
            minerEnv.step(str(action))  # Performing the action in order to obtain the new state
            s_next = minerEnv.get_state()  # Getting a new state
            s = s_next
        except Exception as e:
            import traceback
    mem_size=50000,
    eps_min=0.1,
    replace=1000,
    eps_dec=1e-5,
    chkpt_dir="models/",
    algo="dqnagent",
    env_name="minerai",
    gamma=0.99,
    epsilon=1,
    lr=0.00001,
)

if load_checkpoint:
    DQNAgent.load_models()

# Initialize environment
minerEnv = MinerEnv(HOST, PORT)
minerEnv.start()

fname = (DQNAgent.algo + "_" + DQNAgent.env_name + "_lr" + str(DQNAgent.lr) + "_" + str(N_EPISODE) + "games")
figure_file = "plots/" + fname + ".png"

n_steps = -100
scores, eps_history, steps_array = [], [], []

# Training process
# The main part of the deep Q-learning algorithm
best_score = -100
for episode_i in range(0, N_EPISODE):
    try:
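        # figure_file above implies a learning-curve plot at the end of
        # training. The repo's plotting helper is not shown; a minimal
        # matplotlib sketch of what it would typically do:
        # import matplotlib.pyplot as plt
        #
        # def save_learning_curve(steps, scores, eps_history, filename):
        #     # Score per step on the left axis, epsilon decay on the right.
        #     fig, ax1 = plt.subplots()
        #     ax1.plot(steps, scores, color='C0')
        #     ax1.set_xlabel('training steps')
        #     ax1.set_ylabel('score', color='C0')
        #     ax2 = ax1.twinx()
        #     ax2.plot(steps, eps_history, color='C1')
        #     ax2.set_ylabel('epsilon', color='C1')
        #     fig.savefig(filename)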
kwargs1 = {
    "state_dim": 28,
    "action_dim": 6,
    "max_action": 1.0,
}
policy_bot = TD3_bot.TD3(**kwargs1)
policy_bot_file = "TD3_Miner_0_2"
policy_bot.load(f"./ref_policy/models/{policy_bot_file}")
#####

replay_buffer = utils.ReplayBuffer(state_dim, action_dim=1, max_size=int(10000))

# Initialize environment
minerEnv = MinerEnv(HOST, PORT)
minerEnv.start()  # init environment

# Evaluate untrained policy
# evaluations = [eval_policy(policy, minerEnv, scale_map)]

train = False
best_score = {1: 0, 2: 0, 3: 0, 4: 0}
score, best_sc = [], 0
for episode_i in range(0, N_EPISODE):
    # Reset environment
    mapID = request_to_env(minerEnv, True)  # init environment game
    minerEnv.reset()
    # action = policy.select_action(np.array(state))
    # state = np.reshape([minerEnv.get_state_tensor(scale_map)], (6, INPUTNUM[1], INPUTNUM[2]))
class TFAgentsMiner(pyenv.PyEnvironment):
    def __init__(self, host, port, debug=False):
        super(TFAgentsMiner, self).__init__()
        self.miner_env = MinerEnv(host, port)
        self.miner_env.start()
        self.debug = debug
        self._action_spec = array_spec.BoundedArraySpec(
            shape=(), dtype=np.int32, minimum=0, maximum=5, name='action')
        self._observation_spec = array_spec.BoundedArraySpec(
            shape=(MAP_MAX_X * 5, MAP_MAX_Y * 5, 6), dtype=np.float32, name='observation')

    def action_spec(self):
        return self._action_spec

    def observation_spec(self):
        return self._observation_spec

    def _reset(self):
        mapID = np.random.randint(1, 6)
        posID_x = np.random.randint(MAP_MAX_X)
        posID_y = np.random.randint(MAP_MAX_Y)
        request = "map" + str(mapID) + "," + str(posID_x) + "," + str(posID_y) + ",50,100"
        self.miner_env.send_map_info(request)
        self.miner_env.reset()
        observation = self.miner_env.get_state()
        return time_step.restart(observation)

    def _log_info(self):
        info = self.miner_env.socket
        # print(f'Map size: {self.miner_env.state.mapInfo.max_x, self.miner_env.state.mapInfo.max_y}')
        print(f"Self - Pos ({info.user.posx}, {info.user.posy}) - Energy {info.user.energy} - Status {info.user.status}")
        for bot in info.bots:
            print(f"Enemy - Pos ({bot.info.posx}, {bot.info.posy}) - Energy {bot.info.energy} - Status {bot.info.status}")

    def _step(self, action):
        if self.debug:
            self._log_info()
        self.miner_env.step(str(action))
        observation = self.miner_env.get_state()
        reward = self.miner_env.get_reward()
        if not self.miner_env.check_terminate():
            return time_step.transition(observation, reward)
        else:
            # Prepare the next episode, then report the terminal step.
            self.reset()
            return time_step.termination(observation, reward)

    def render(self):
        pass
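# A minimal sketch of driving TFAgentsMiner through tf-agents' TF wrapper.
# Host/port are placeholders; validate_py_environment is the standard
# tf-agents utility that runs a few episodes against the declared specs.
from tf_agents.environments import tf_py_environment
from tf_agents.environments import utils as env_utils

py_env = TFAgentsMiner("localhost", 1111)
env_utils.validate_py_environment(py_env, episodes=2)  # spec sanity check
tf_env = tf_py_environment.TFPyEnvironment(py_env)
ts = tf_env.reset()
ts = tf_env.step(0)  # actions are int32 scalars in [0, 5]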
    n_actions=ACTION_NUM,
    mem_size=MEMORY_SIZE,
    eps_min=0.05,
    batch_size=BATCH_SIZE,
    replace=10000,
    eps_dec=1e-5,
    chkpt_dir=FILE_PATH + '/weights/',
    algo='DDQNAgent',
    env_name='miner')

load_checkpoint = False
if load_checkpoint:
    agent.load_models()

n_games = 20000
minerEnv = MinerEnv(HOST, PORT)  # Creating a communication environment between the DQN model and the game environment (GAME_SOCKET_DUMMY.py)
minerEnv.start()  # Connect to the game
path = FILE_PATH + '/Maps/'

fname = agent.algo + '_' + agent.env_name + '_lr' + str(agent.lr) + '_' + str(n_games) + 'games'
figure_file = FILE_PATH + '/' + fname + '.png'

best_score = -np.inf
n_steps = 0
scores, eps_history, steps_array = [], [], []

for episode_i in range(n_games):
    done = False
class MinerGymEnv(gym.Env):
    def __init__(self, HOST, PORT, debug=False):
        self.minerEnv = MinerEnv(HOST, PORT)
        self.minerEnv.start()
        self.action_space = spaces.Discrete(5)
        self.observation_space = spaces.Discrete(198)
        self.debug = debug
        self.view = None
        self.ob = None
        self.state = self.minerEnv.state

    def print(self, message):
        if self.debug:
            print(message)

    def draw_text(self, mat, text):
        cv2_im_rgb = cv2.cvtColor(mat, cv2.COLOR_BGR2RGB)
        pil_im = Image.fromarray(cv2_im_rgb)
        draw = ImageDraw.Draw(pil_im)
        draw.text((10, 10), text, font=font)
        cv2_im_processed = cv2.cvtColor(np.array(pil_im), cv2.COLOR_RGB2BGR)
        return cv2_im_processed
        # cv2.imwrite("result.png", cv2_im_processed)

    def step(self, action):
        self.minerEnv.step(str(action))
        self.status = self.minerEnv.get_state()
        reward = self.get_reward()
        ob = self.get_state()
        episode_over = self.check_terminate()
        self.ob = ob
        if self.debug:
            self.render()
        return ob, reward, episode_over, {}

    def check_terminate(self):
        return self.minerEnv.check_terminate()

    def send_map_info(self, request):
        return self.minerEnv.send_map_info(request)

    def get_state(self):
        # Build a 2D view of the map: obstacles as negative IDs, gold as its amount.
        view = np.zeros([self.state.mapInfo.max_x + 1, self.state.mapInfo.max_y + 1], dtype=int)
        for i in range(self.state.mapInfo.max_x + 1):
            for j in range(self.state.mapInfo.max_y + 1):
                if self.state.mapInfo.get_obstacle(i, j) == TreeID:  # Tree
                    view[i, j] = -TreeID
                if self.state.mapInfo.get_obstacle(i, j) == TrapID:  # Trap
                    view[i, j] = -TrapID
                if self.state.mapInfo.get_obstacle(i, j) == SwampID:  # Swamp
                    view[i, j] = -SwampID
                if self.state.mapInfo.gold_amount(i, j) > 0:
                    view[i, j] = self.state.mapInfo.gold_amount(i, j)
        self.view = view
        return self.minerEnv.get_state()

    def reset(self):
        mapID = np.random.randint(1, 6)
        posID_x = np.random.randint(MAP_MAX_X)
        posID_y = np.random.randint(MAP_MAX_Y)
        request = "map" + str(mapID) + "," + str(posID_x) + "," + str(posID_y) + ",50,100"
        self.minerEnv.send_map_info(request)
        self.minerEnv.reset()
        state = self.get_state()  # read the state only after the environment has been reset
        return state

    def render(self, mode='human'):
        if self.view is None:
            return
        h, w = self.view.shape
        mat = np.zeros(shape=(h, w, 3), dtype=np.uint8)
        mat[self.view == -1, 1] = 153  # trees
        mat[self.view == -3, 1] = 53   # swamps
        mat[self.view == -2, 0] = 153  # traps
        mat[self.view > 0, 1:3] = np.array(
            [self.view[self.view > 0], self.view[self.view > 0]]).T
        remaining_gold = sum(self.view[self.view > 0].flatten())
        t = PrettyTable(['ID', 'Score', 'Energy', 'Free count'])
        for player in self.minerEnv.state.players:
            id = player['playerId']
            score = player['score']
            energy = player['energy']
            free_count = player['freeCount']
            x = player['posx']
            y = player['posy']
            if x >= h or y >= w:
                continue
            if player['playerId'] == self.minerEnv.state.id:
                mat[x, y, :] = 255
                t.add_row(['player', score, energy, free_count])
            else:
                mat[x, y, 2] = 153
                t.add_row(['bot {}'.format(id), score, energy, free_count])
        blank = np.zeros(shape=(h * 38, w * 38, 3), dtype=np.uint8)
        z = 'Remaining gold: {}\n'.format(remaining_gold)
        z += t.get_string()
        blank = self.draw_text(mat=blank, text=z)
        mat = cv2.resize(mat, (w * 38, h * 38), interpolation=cv2.INTER_AREA)
        mat = np.concatenate((mat, blank), 1)
        cv2.imshow('game view', mat)
        cv2.waitKey(1)

    def get_reward(self):
        return self.minerEnv.get_reward()

    def close(self):
        self.minerEnv.end()

    def start(self):
        return self.minerEnv.start()
policy = DDPG.DDPG(**kwargs)
if args.policy == "newDDPG":
    policy = newDDPG.DDPG(**kwargs)
if args.policy == "newTD3":
    policy = newTD3.TD3(**kwargs)
if args.policy == "A2C":
    policy = A2C.A2C(**kwargs)

if args.load_model != "":
    policy_file = file_name if args.load_model == "default" else args.load_model
    policy.load(f"./models/{policy_file}")

replay_buffer = utils.ReplayBuffer(state_dim, action_dim=action_dim)

# Initialize environment
minerEnv = MinerEnv(HOST, PORT)
minerEnv.start()  # init environment

# Evaluate untrained policy
evaluations = [eval_policy(policy, minerEnv)]

train = False
for episode_i in range(0, N_EPISODE):
    # Reset environment
    mapID = request_to_env(minerEnv, train)  # init environment game
    minerEnv.reset()
    # action = policy.select_action(np.array(state))
    state = minerEnv.get_state2(int(args.limit_obs))
    done = False
    maxStep = minerEnv.state.mapInfo.maxStep
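    # eval_policy is called above but not shown. A sketch consistent with the
    # call site, built only from env methods visible in this file; the episode
    # count and the continuous-to-discrete action cast are assumptions:
    # def eval_policy(policy, env, eval_episodes=5):
    #     avg_reward = 0.0
    #     for _ in range(eval_episodes):
    #         request_to_env(env, False)
    #         env.reset()
    #         state = env.get_state2(int(args.limit_obs))
    #         while not env.check_terminate():
    #             action = policy.select_action(np.array(state))
    #             env.step(str(int(np.argmax(action))))  # assumed discretization
    #             state = env.get_state2(int(args.limit_obs))
    #             avg_reward += env.get_reward()
    #     return avg_reward / eval_episodes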
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = 'Logs/' + current_time
summary_writer = tf.summary.FileWriter(log_dir)
# log_dir_2 = 'Logs/check_time_' + current_time
# summary_writer_time = tf.summary.FileWriter(log_dir_2)

# Initialize a DQN model and a replay memory for storing experiences
DQNAgent = DQN(INPUT_SHAPE_1, INPUT_SHAPE_2, ACTION_NUM, epsilon_decay=0.99999, epsilon_min=0.1)
DQNAgent.update_target_model()
memory = Memory(MEMORY_SIZE)
current_memory = Memory(32000)

# Initialize environment
minerEnv = MinerEnv(HOST, PORT)  # Creating a communication environment between the DQN model and the game environment (GAME_SOCKET_DUMMY.py)
minerEnv.start()  # Connect to the game

train = False  # Indicates that replay has started and epsilon starts to decrease.

# Training process: the main part of the deep Q-learning algorithm
total_step = 0
loss1 = 0
loss2 = 0
for episode_i in range(0, N_EPISODE):
    try:
        # Choosing a map in the list
        # mapID = np.random.randint(1, 13)  # Choosing a map ID from the 12 maps in the Maps folder randomly
        mapID = 1  # Fixed map for this run
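        # update_target_model() above implies a hard target-network sync. A
        # minimal sketch of what such a method usually does in a Keras DQN;
        # the `model` and `target_model` attribute names are assumptions for
        # illustration, not taken from this repo's DQN class:
        #
        # def update_target_model(self):
        #     self.target_model.set_weights(self.model.get_weights())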