def run(render):
    net = DeepQNetwork(sess, N_A, N_S,
                       learning_rate=0.01,
                       reward_decay=0.9,
                       e_greedy=0.9,
                       replace_target_iter=200,
                       memory_size=2000,
                       scope='dqn_{0}'.format(0),
                       # output_graph=True
                       )
    sess.run(tf.global_variables_initializer())

    step = 0
    for episode in range(300):
        # initial observation
        s = env.reset()
        while True:
            # RL choose action based on observation
            a, q = net.choose_action(s)

            # RL take action and get next observation and reward
            s_, r, d, _ = env.step(a)
            if render:
                env.render()
            # print('rewards: {0}'.format(r))

            net.store_transition(s, a, r, s_)

            if (step > 200) and (step % 5 == 0):
                net.learn()

            # swap observation
            s = s_

            # break while loop when end of this episode
            if d:
                break

            step += 1

                   e_greedy=0.9,
                   replace_target_iter=100,
                   memory_size=2000,
                   e_greedy_increment=0.001)

total_steps = 0

for i_episode in range(100):

    observation = env.reset()
    ep_r = 0
    while True:
        env.render()

        action = RL.choose_action(observation)

        observation_, reward, done, info = env.step(action)

        # change to a more reasonable reward function
        x, x_dot, theta, theta_dot = observation_
        r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
        r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
        reward = r1 + r2

        RL.store_transition(observation, action, reward, observation_)

        ep_r += reward
        if total_steps > 1000:
            RL.learn()

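# Note (added sketch): the shaped CartPole reward above combines a cart-position
# term and a pole-angle term. As a rough sanity check, with the cart centred and
# the pole upright (x = 0, theta = 0) it evaluates to r1 + r2 = 0.2 + 0.5 = 0.7,
# and it falls toward -1.3 as either threshold is approached. A minimal
# standalone version, assuming a gym CartPole-style env exposing x_threshold and
# theta_threshold_radians (shaped_cartpole_reward is a hypothetical helper name):
def shaped_cartpole_reward(env, observation):
    x, x_dot, theta, theta_dot = observation
    r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
    r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
    return r1 + r2
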
observation = env.reset()
state_.append(observation)
# observation = np.identity(16)[observation:observation + 1]
# observation = np.expand_dims(observation, axis=2)
# observation = np.expand_dims(observation, axis=3)
# observation = rgb2gray(observation)
score = 0
while True:
    env.render()

    action = dqn.choose_action(
        np.expand_dims(np.array(list(state)), axis=2),
        True if counter < n_width else False)
    # action = dqn.choose_action(observation)
    # action = dqn.choose_action(np.reshape(observation, (1, 3, 1)))

    f_action = (action - (n_action - 1) / 2) / ((n_action - 1) / 4)
    observation_, reward, done, info = env.step(np.array([f_action]))

    reward = reward / 10
    # observation_ = np.identity(16)[observation_:observation_ + 1]
    # observation_ = np.expand_dims(observation_, axis=2)
    # observation_ = np.expand_dims(observation_, axis=3)
    # observation_ = rgb2gray(observation_)
    score += reward

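# Note (added sketch): the f_action line above rescales a discrete action index
# into a symmetric continuous range. Assuming n_action discretisation levels and
# a Pendulum-style torque range of [-2, 2], index 0 maps to -2.0, the middle
# index maps to 0.0, and index n_action - 1 maps to +2.0:
n_action = 11  # assumed number of discrete levels, for illustration only
for a in (0, (n_action - 1) // 2, n_action - 1):
    f_action = (a - (n_action - 1) / 2) / ((n_action - 1) / 4)
    print(a, f_action)  # -> 0 -2.0, 5 0.0, 10 2.0
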
def run(render):
    nets = []
    for i in range(4):
        net = DeepQNetwork(sess, N_A, N_S,
                           learning_rate=0.01,
                           reward_decay=0.9,
                           e_greedy=0.9,
                           replace_target_iter=200,
                           memory_size=2000,
                           scope='dqn_{0}'.format(i),
                           # output_graph=True
                           )
        nets.append(net)
    sess.run(tf.global_variables_initializer())

    step = 0
    for episode in range(300):
        # initial observation
        s = env.reset()
        while True:
            # fresh env
            if render:
                env.render()

            # RL choose action based on observation
            q_sum = np.zeros(4, dtype=float)
            count = np.zeros(4, dtype=float)
            for i in range(len(nets)):
                net = nets[i]
                a, q = net.choose_action(s)
                q_sum[a] += W[i] * q
                count[a] += 1
            q_sum[count > 0] = q_sum[count > 0] / count[count > 0]
            a = np.argmax(q_sum)
            print('mean: {0}'.format(q_sum))

            # RL take action and get next observation and reward
            s_, r, d, _ = env.step(a)
            print('rewards: {0}'.format(r))

            for i in range(len(nets)):
                net = nets[i]
                net.store_transition(s, a, r[i], s_)

            if (step > 200) and (step % 5 == 0):
                for net in nets:
                    net.learn()

            # swap observation
            s = s_

            # break while loop when end of this episode
            if d:
                break

            step += 1

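# Note (added sketch): the action-selection loop above implements a weighted
# ensemble vote over several DQNs. Pulled out as a standalone helper for
# readability (W is the per-network weight vector and n_actions the size of the
# action space, both assumed to match the snippet above):
def ensemble_action(nets, s, W, n_actions):
    q_sum = np.zeros(n_actions, dtype=float)
    count = np.zeros(n_actions, dtype=float)
    for i, net in enumerate(nets):
        a, q = net.choose_action(s)       # each net proposes an action and its Q-value
        q_sum[a] += W[i] * q              # accumulate weighted Q mass per proposed action
        count[a] += 1
    q_sum[count > 0] /= count[count > 0]  # average over the nets that voted for each action
    return np.argmax(q_sum)
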
episodes = 20
dqn = DeepQNetwork(env.action_space.n,
                   episodes=episodes,
                   observation_space=env.observation_space.n)
episode_reward_np_array = np.zeros(episodes)
random_probability = 1

for episode in range(episodes):
    done = False
    episode_reward = 0
    current_state = env.reset()
    step_counter = 0
    while not done:
        dqn.set_random_probability(random_probability)
        # os.system("clear")
        action = dqn.choose_action(current_state)
        next_state, reward, done, info = env.step(action)
        dqn.store_experience(current_state, action, reward, next_state, done)
        if step_counter > 100 and step_counter % 5 == 0:
            dqn.learn()
        current_state = next_state
        episode_reward += reward
        step_counter += 1
        # env.render()
        # time.sleep(0.1)

class trainer():
    def __init__(self, station_history):

        # Session Properties
        self.episodes = []
        self.stock_type = ""
        self.logging = False
        self.env_debug = False
        self.rl_debug = False
        self.bike_station = None
        self.operator = None
        self.sim_stock = []
        self.model_based = False
        self.ID = None
        self.method = None
        self.station_history = station_history

        # Performance Metric
        self.success_ratio = 0
        self.rewards = []  # [[r from session 1], [r from session 2] ...]
        self.avg_rewards = []  # [np.mean([r from session 1]), np.mean([r from session 2]) ...]
        self.final_stocks = []  # [[stock from session 1], [stock from session 2] ...]
        self.episode_action_history = []
        self.episode_stock_history = []
        self.session_action_history = []
        self.session_stock_history = []
        self.q_tables = []
        self.actions = [-10, -3, -1, 0]

    def start(self, episodes, stock_type, logging, env_debug, rl_debug, brain,
              ID, model_based):
        # brain: which method to use, Q-learning vs. DQN
        self.episodes = episodes
        self.stock_type = stock_type
        self.logging = logging
        self.env_debug = env_debug
        self.rl_debug = rl_debug
        self.brain = brain
        self.ID = ID
        self.model_based = model_based

        if brain == 'q' and model_based == False:
            self.method = 'QLN'
        elif brain == 'q' and model_based == True:
            self.method = 'FCT'
        else:
            self.method = 'DQN'

        idx = 0
        for eps in self.episodes:

            # Initiate new environment and RL agent
            self.bike_station = env(self.stock_type,
                                    debug=self.env_debug,
                                    ID=self.ID,
                                    station_history=self.station_history)
            self.sim_stock.append(self.bike_station.get_sim_stock())

            if self.brain == 'q':
                self.operator = agent(
                    epsilon=0.9,
                    lr=0.01,
                    gamma=0.9,
                    current_stock=self.bike_station.current_stock(),
                    debug=self.rl_debug,
                    expected_stock=self.bike_station.get_expected_stock(),
                    model_based=model_based)
            elif self.brain == 'dqn':
                self.operator = DeepQNetwork(self.bike_station.n_actions,
                                             self.bike_station.n_features,
                                             0.01, 0.9)
            else:
                print("Error: pick correct brain")
                break

            # Train the RL agent and collect performance stats
            rewards, final_stocks = self.train_operator(
                idx,
                len(self.episodes),
                eps,
                logging=self.logging,
                brain=self.brain,
                model_based=self.model_based)

            # Log the results from this training session
            self.rewards.append(rewards)
            self.avg_rewards.append(np.mean(rewards))
            self.final_stocks.append(final_stocks)
            # self.q_tables.append(self.operator.get_q_table())
            self.session_action_history.append(self.episode_action_history)
            self.session_stock_history.append(self.episode_stock_history)
            self.reset_episode_history()

            # Destroy the environment and agent objects
            self.bike_station = None
            self.operator = None

            idx += 1

        if logging == True:
            if self.brain == 'q':
                self.save_session_results(self.get_timestamp(replace=True))
            else:
                self.save_session_results_dqn(self.get_timestamp(replace=True))

        return

    def train_operator(self, idx, num_sessions, episodes, logging, brain,
                       model_based):
        '''
        This function trains an RL agent by interacting with the bike station
        environment. It also tracks and reports performance stats.

        Input:
            - episodes: an int, the number of episodes to be trained in this
              session (e.g. 500)
        Output:
            - reward_list: a list of rewards per episode in this session
            - final_stocks: a list of final stocks per episode in this session
        '''

        print("Start training the Agent ...")

        rewards = 0
        reward_list = []
        final_stocks = []
        step = 0

        for eps in range(episodes):

            self.bike_station.reset()

            while True:
                # Agent picks an action (number of bikes to move)
                # Agent sends the action to the bike station environment
                # Agent gets feedback from the environment (e.g. reward of the
                # action, new bike stock after the action, etc.)
                # Agent "learns" the feedback by updating its Q-Table (state, action, reward)
                # Repeat until end of day (23 hours)
                # Reset bike station environment to start a new day, repeat all

                if self.brain == 'q':
                    action = self.operator.choose_action(
                        self.bike_station.get_old_stock(),
                        self.bike_station.get_expected_stock())
                    current_hour, old_stock, new_stock, expected_stock, _, reward, done, game_over = self.bike_station.ping(
                        action)
                else:
                    action = self.operator.choose_action(
                        self.bike_station.get_old_stock())
                    current_hour, old_stock, new_stock, reward, done = self.bike_station.ping_dqn(
                        action)
                    self.operator.store_transition(old_stock, action, reward,
                                                   new_stock)
                    if step > 50 and (step % 10 == 0):
                        self.operator.learn()

                # observation_, reward, done = self.bike_station.ping(action)

                if done == True:
                    print(
                        "{} of {} Session | Episode: {} | Final Stock: {} |Final Reward: {:.2f}"
                        .format(idx, num_sessions, eps, old_stock, rewards))
                    reward_list.append(rewards)
                    final_stocks.append(old_stock)
                    rewards = 0

                    # Log session action history by episode
                    if brain == 'q':
                        self.episode_action_history.append(
                            self.operator.get_hourly_actions())
                        self.episode_stock_history.append(
                            self.operator.get_hourly_stocks())
                        self.operator.reset_hourly_history()
                    else:
                        self.episode_stock_history.append(
                            self.operator.get_hourly_stocks())
                        self.operator.reset_hourly_history()

                    break

                if brain == 'q':
                    self.operator.learn(old_stock, action, reward, new_stock,
                                        expected_stock, game_over)

                step += 1
                rewards += reward

            # Log hourly action history by each episode
            with open('dqn_log.txt', 'a') as f:
                f.write(
                    "{} of {} Session | Episode: {} | Final Stock: {} |Final Reward: {:.2f} \n"
                    .format(idx, num_sessions, eps, old_stock, rewards))

        return reward_list, final_stocks

    def get_timestamp(self, replace):
        if replace == True:
            return str(datetime.datetime.now()).replace(" ", "").replace(":", "").\
                replace(".", "").replace("-", "")
        else:
            return str(datetime.datetime.now())

    def reset_episode_history(self):
        self.episode_action_history = []
        self.episode_stock_history = []

    def cal_performance(self):
        successful_stocking = []

        print("===== Performance =====")

        for session in range(len(self.final_stocks)):
            length = len(self.final_stocks[session])
            num_overstock = np.count_nonzero(
                np.array(self.final_stocks[session]) > 50)
            num_understock = np.count_nonzero(
                np.array(self.final_stocks[session]) <= 0)
            ratio = (length - num_understock - num_overstock) * 100 / length

            print(
                "Session {} | Overstock {} Times | Understock {} Times | {}% Successful"
                .format(session, num_overstock, num_understock, ratio))
            average_reward = round(self.avg_rewards[session], 2)
            print("Average Episode Reward for Session: {}".format(
                average_reward))

            successful_stocking.append(ratio)

        return successful_stocking

    def save_session_results(self, timestamp):
        '''
        This function logs the following:
            - overall success ratio of each session
            - line chart of success ratio by session
            - line chart of reward history by session
            - Q Table of each session
            - comparison line chart of first and last episode hourly actions
        '''

        # --- create a session folder ---
        dir_path = "./performance_log/" + timestamp
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

        successful_stocking = self.cal_performance()

        # --- Write Success Rate to File ---
        fname = dir_path + "/success_rate - " + timestamp + ".txt"
        with open(fname, 'w') as f:
            f.write("Logged at {}".format(self.get_timestamp(replace=False)))
            f.write("\n")
            f.write("This training session ran episodes: {}".format(
                self.episodes))
            f.write("\n")

            for session in range(len(successful_stocking)):
                f.write(
                    "Session {} | Episodes: {} | Success Rate: {:.2f}%".format(
                        session, self.episodes[session],
                        successful_stocking[session]))
                f.write("\n")

        # --- Plot Overall Success Rate by Episode ---
        title = "% of Successful Rebalancing - " + timestamp
        fig1 = plt.figure()
        plt.plot(self.episodes, successful_stocking)
        plt.xlabel("Episodes")
        plt.ylabel("% Success Rate")
        plt.title(title)
        fig1.savefig(dir_path + "/session_success_rate_" + timestamp)

        # --- Plot Reward History by Training Session ---
        for session in range(len(self.rewards)):
            fig = plt.figure(figsize=(10, 8))
            title = "Reward History by Training Session " + str(
                session) + " - " + timestamp
            x_axis = [x for x in range(self.episodes[session])]
            plt.plot(x_axis, self.rewards[session],
                     label="Session " + str(session))
            plt.legend()
            plt.xlabel("Episode")
            plt.ylabel("Reward")
            plt.title(title)
            fig.savefig(dir_path + "/reward_history_session_" + \
                        str(session) + timestamp)

        # --- Plot Average Reward History by Training Session ---
        figR = plt.figure(figsize=[10, 8])
        lengths = [len(r) for r in self.rewards]
        means = [np.mean(r) for r in self.rewards]
        if len(self.rewards) > 1:
            increment = (lengths[1] - lengths[0]) / 20
        else:
            increment = lengths[0] / 20

        for reward_list in self.rewards:
            Q3 = np.percentile(reward_list, 75)
            Q1 = np.percentile(reward_list, 25)
            M = np.mean(reward_list)
            location = len(reward_list)
            plt.plot([location - increment, location + increment], [Q1, Q1], 'k-')
            plt.plot([location - increment, location + increment], [Q3, Q3], 'k-')
            plt.plot([location, location], [Q1, Q3], 'k-')
            plt.scatter(location, M, s=100, color='dodgerblue')

        plt.xlabel('Number of Episodes in Session')
        plt.ylabel('Average Reward per Episode')
        plt.title('Average Reward vs. Session Size', size=20)
        plt.xticks(lengths)
        plt.plot(lengths, means, linestyle='--')
        figR.savefig(dir_path + "/reward_averages")

        # --- Save Q tables ---
        for session in range(len(self.q_tables)):
            self.q_tables[session].to_csv(dir_path + "/q_table_session_" + \
                                          str(session) + timestamp + ".csv")

        # --- Comparison Line Chart of First and Last Episode for each Session ---
        file_path = dir_path + "/action_history"
        if not os.path.exists(file_path):
            os.makedirs(file_path)

        for session in range(len(self.session_action_history)):
            first_eps_idx = 0
            last_eps_idx = len(self.session_action_history[session]) - 1
            fig = plt.figure(figsize=(10, 8))
            title = "Session " + str(
                session) + " - Hourly Action of Eps " + str(
                    first_eps_idx) + " and Eps " + str(last_eps_idx)
            x_axis = [
                x for x in range(len(self.session_action_history[session][0]))
            ]
            plt.plot(x_axis, self.session_action_history[session][0],
                     label="Eps 0")
            plt.plot(x_axis, self.session_action_history[session][-1],
                     label="Eps " + str(last_eps_idx))
            plt.legend()
            plt.xlabel("Hours")
            plt.ylabel("Number of Bikes Moved")
            plt.title(title)
            fig.savefig(file_path + "/action_history_" + str(session) +
                        timestamp)

        # --- Comparison Line Chart of Simulated and Rebalanced Bike Stock --- #
        file_path = dir_path + "/stock_history"
        if not os.path.exists(file_path):
            os.makedirs(file_path)

        for session in range(len(self.session_stock_history)):
            first_eps_idx = 0
            last_eps_idx = len(self.session_action_history[session]) - 1
            fig = plt.figure(figsize=(10, 8))
            title = "[" + self.method + "]" + " Session " + str(
                session) + " - Original vs. Balanced Bike Stock after " + str(
                    first_eps_idx) + " and Eps " + str(last_eps_idx)
            x_axis = [
                x for x in range(len(self.session_stock_history[session][0]))
            ]
            plt.plot(x_axis, self.sim_stock[session],
                     label="Original without Balancing")
            plt.plot(x_axis, self.session_stock_history[session][0],
                     label="Balanced Bike Stock - Eps 0")
            plt.plot(x_axis, self.session_stock_history[session][-1],
                     label="Balanced Bike Stock - Eps " + str(last_eps_idx))
            plt.axhline(y=50, c="r", ls="--", label="Upper Stock Limit")
            plt.axhline(y=0, c="r", ls="--", label="Lower Stock Limit")
            plt.legend()
            plt.xlabel("Hours")
            plt.ylabel("Number of Bike Stock")
            plt.title(title)
            fig.savefig(file_path + "/stock_history_" + str(session) +
                        timestamp)

        return

    def save_session_results_dqn(self, timestamp):
        dir_path = "./performance_log/" + timestamp
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

        # --- Comparison Line Chart of Simulated and Rebalanced Bike Stock --- #
        file_path = dir_path + "/stock_history"
        if not os.path.exists(file_path):
            os.makedirs(file_path)

        successful_stocking = self.cal_performance()

        # --- Write Success Rate to File ---
        fname = dir_path + "/success_rate - " + timestamp + ".txt"
        with open(fname, 'w') as f:
            f.write("Logged at {}".format(self.get_timestamp(replace=False)))
            f.write("\n")
            f.write("This training session ran episodes: {}".format(
                self.episodes))
            f.write("\n")

            for session in range(len(successful_stocking)):
                f.write(
                    "Session {} | Episodes: {} | Success Rate: {:.2f}%".format(
                        session, self.episodes[session],
                        successful_stocking[session]))
                f.write("\n")

        # --- Plot Overall Success Rate by Episode ---
        title = "% of Successful Rebalancing - " + timestamp
        fig1 = plt.figure()
        plt.plot(self.episodes, successful_stocking)
        plt.xlabel("Episodes")
        plt.ylabel("% Success Rate")
        plt.title(title)
        fig1.savefig(dir_path + "/session_success_rate_" + timestamp)

        for session in range(len(self.session_stock_history)):
            first_eps_idx = 0
            last_eps_idx = len(self.session_stock_history[session]) - 1
            fig = plt.figure(figsize=(10, 8))
            title = "[" + self.method + "]" + " Session " + str(
                session) + " - Original vs. Balanced Bike Stock after " + str(
                    first_eps_idx) + " and Eps " + str(last_eps_idx)
            x_axis = [
                x for x in range(len(self.session_stock_history[session][0]))
            ]
            plt.plot(x_axis, self.sim_stock[session],
                     label="Original without Balancing")
            plt.plot(x_axis, self.session_stock_history[session][0],
                     label="Balanced Bike Stock - Eps 0")
            plt.plot(x_axis, self.session_stock_history[session][-1],
                     label="Balanced Bike Stock - Eps " + str(last_eps_idx))
            plt.axhline(y=50, c="r", ls="--", label="Upper Stock Limit")
            plt.axhline(y=0, c="r", ls="--", label="Lower Stock Limit")
            plt.legend()
            plt.xlabel("Hours")
            plt.ylabel("Number of Bike Stock")
            plt.title(title)
            fig.savefig(file_path + "/stock_history_" + "DQN" + str(session) +
                        timestamp)

        return

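# Note (added sketch): a minimal usage example for the trainer class above,
# assuming the bike-station env/agent/DeepQNetwork classes it references are
# importable and that station_history is the hourly stock history the env
# expects. load_station_history is a hypothetical helper, and the episode
# counts, stock_type value, and flags are illustrative only.
station_history = load_station_history()  # hypothetical: returns hourly stock data
session = trainer(station_history)
session.start(episodes=[200, 500],   # one training session per entry
              stock_type='linear',   # assumed stock-simulation mode
              logging=True,
              env_debug=False,
              rl_debug=False,
              brain='dqn',           # 'q' for tabular Q-learning, 'dqn' for the deep network
              ID=0,
              model_based=False)
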
            agent.sess, r"saved model/mountain car/dqn/mountain car_dqn.ckpt")

# if you don't want to train, set this
# agent.learn_threshold = 1e8
steps = []
for i_episode in range(20):
    total_steps = 0
    observation = env.reset()
    while True:
        # if i_episode > 10:
        #     env.render()

        # False means act deterministically
        action = agent.choose_action(observation, True)

        observation_, reward, done, info = env.step(action)
        if done:
            reward = 10

        agent.store_transition(observation, action, reward, observation_, done)

        if done:
            print('episode', i_episode, total_steps)
            steps.append(total_steps)
            break

        observation = observation_
        total_steps += 1

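# Note (added sketch): the truncated call above is presumably the tail of a
# checkpoint restore. With TensorFlow 1.x this would typically use tf.train.Saver
# (agent.sess and the checkpoint path are the ones already referenced above):
saver = tf.train.Saver()
saver.restore(agent.sess, r"saved model/mountain car/dqn/mountain car_dqn.ckpt")
# ... and after further training, weights could be written back with:
# saver.save(agent.sess, r"saved model/mountain car/dqn/mountain car_dqn.ckpt")
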
class SmartAgent(object):
    def __init__(self):
        # from the original base.agent
        self.reward = 0
        self.episodes = 0
        self.steps = 0
        self.obs_spec = None
        self.action_spec = None
        self.dqn = DeepQNetwork(
            len(smart_actions),
            10,  # one of the most important values; needs to be updated manually
            learning_rate=0.001,
            reward_decay=0.9,
            e_greedy=0.9,
            replace_target_iter=200,
            memory_size=5000,
            batch_size=32,
            e_greedy_increment=None,
            output_graph=True)

        # self-defined vars
        self.fighting = False
        self.player_hp = []
        self.enemy_hp = []
        self.previous_enemy_hp = []
        self.previous_player_hp = []
        self.leftover_enemy_hp = []
        self.win = 0
        self.count = 0
        self.previous_action = None
        self.previous_state = None

    def step(self, obs):
        # from the original base.agent
        self.steps += 1
        self.reward += obs.reward

        current_state, enemy_hp, player_hp, enemy_loc, player_loc, distance, selected, enemy_count, player_count, player_cooldown = self.extract_features(
            obs)

        self.player_hp.append(sum(player_hp))
        self.enemy_hp.append(sum(enemy_hp))

        # script the few initial actions to increase the learning performance
        while not self.fighting:
            for i in range(0, player_count):
                if distance[i] < 20:
                    self.fighting = True
                    # return actions.FunctionCall(_NO_OP, [])
            return actions.FunctionCall(_ATTACK_SCREEN,
                                        [_NOT_QUEUED, enemy_loc[0]])

        # Default case => select a unit:
        # select the unit that is closest to the enemy
        # if same distance, pick the one with lower hp
        # if same distance and hp, randomly select one
        closest_indices = []
        closest_index = distance.index(min(distance))
        for i in range(0, player_count):
            if distance[i] == distance[closest_index]:
                closest_indices.append(i)

        lowest_hp_indices = []
        lowest_hp_index = player_hp.index(min(player_hp))
        for i in range(0, player_count):
            if player_hp[i] == player_hp[lowest_hp_index]:
                lowest_hp_indices.append(i)

        common_indices = list(
            set(closest_indices).intersection(lowest_hp_indices))

        if len(common_indices) != 0:
            selected_index = random.choice(common_indices)
        elif len(closest_indices) != 0:
            selected_index = random.choice(closest_indices)
        else:
            selected_index = 0

        if selected[selected_index] == 0 or (selected[0] == 1
                                             and selected[1] == 1):
            return actions.FunctionCall(
                _SELECT_POINT, [_NOT_QUEUED, player_loc[selected_index]])

        rl_action = self.dqn.choose_action(np.array(current_state))
        smart_action = smart_actions[rl_action]

        # record the transitions to memory and learn by DQN
        if self.previous_action is not None:
            reward = self.get_reward(obs, distance, player_hp, enemy_hp,
                                     player_count, enemy_count, rl_action,
                                     selected, player_loc, enemy_loc,
                                     player_cooldown)
            self.dqn.store_transition(np.array(self.previous_state),
                                      self.previous_action, reward,
                                      np.array(current_state))

        self.previous_state = current_state
        self.previous_action = rl_action
        self.previous_enemy_hp = enemy_hp
        self.previous_player_hp = player_hp

        next_action = self.perform_action(obs, smart_action, player_loc,
                                          enemy_loc, selected, player_count,
                                          enemy_count, distance, player_hp)

        return next_action

    def get_reward(self, obs, distance, player_hp, enemy_hp, player_count,
                   enemy_count, rl_action, selected, unit_locs, enemy_loc,
                   player_cooldown):
        reward = 0.
        selected_index = -1
        for i in range(0, DEFAULT_PLAYER_COUNT):
            if selected[i] == 1:
                selected_index = i

        x = unit_locs[selected_index][0]
        y = unit_locs[selected_index][1]

        if distance[selected_index] < 6 or distance[selected_index] > 20:
            reward -= 1
        else:
            reward = distance[selected_index] / 20

        return reward

    # extract all the desired features as inputs for the DQN
    def extract_features(self, obs):
        var = obs.observation['feature_units']

        # get units' location and distance
        enemy, player = [], []
        # get health
        enemy_hp, player_hp, player_cooldown = [], [], []
        # record the selected army
        is_selected = []
        # unit count
        enemy_unit_count, player_unit_count = 0, 0

        for i in range(0, var.shape[0]):
            if var[i][_UNIT_ALLIANCE] == _PLAYER_HOSTILE:
                enemy.append((var[i][_UNIT_X], var[i][_UNIT_Y]))
                enemy_hp.append(var[i][_UNIT_HEALTH] + var[i][_UNIT_SHIELD])
                enemy_unit_count += 1
            else:
                player.append((var[i][_UNIT_X], var[i][_UNIT_Y]))
                player_hp.append(var[i][_UNIT_HEALTH])
                is_selected.append(var[i][_UNIT_IS_SELECTED])
                player_cooldown.append((var[i][_UNIT_COOLDOWN]))
                player_unit_count += 1

        # pad if necessary so that the current state keeps a fixed length
        for i in range(player_unit_count, DEFAULT_PLAYER_COUNT):
            player.append((-1, -1))
            player_hp.append(0)
            player_cooldown.append(0)
            is_selected.append(-1)

        for i in range(enemy_unit_count, DEFAULT_ENEMY_COUNT):
            enemy.append((-1, -1))
            enemy_hp.append(0)

        # get distance
        min_distance = [100000 for x in range(DEFAULT_PLAYER_COUNT)]
        for i in range(0, player_unit_count):
            for j in range(0, enemy_unit_count):
                distance = int(
                    math.sqrt((player[i][0] - enemy[j][0])**2 +
                              (player[i][1] - enemy[j][1])**2))
                if distance < min_distance[i]:
                    min_distance[i] = distance

        # some new stuff to try
        player_units, enemy_units = [], []
        for i in range(0, var.shape[0]):
            if var[i][_UNIT_ALLIANCE] == _PLAYER_HOSTILE:
                unit = []
                unit.append(var[i][_UNIT_X])
                unit.append(var[i][_UNIT_Y])
                unit.append(var[i][_UNIT_HEALTH] + var[i][_UNIT_SHIELD])
                unit.append(var[i][_UNIT_COOLDOWN])
                enemy_units.append(unit)
            else:
                unit = []
                unit.append(var[i][_UNIT_X])
                unit.append(var[i][_UNIT_Y])
                unit.append(var[i][_UNIT_HEALTH])
                unit.append(var[i][_UNIT_COOLDOWN])
                unit.append(100000)  # default distance
                unit.append(var[i][_UNIT_IS_SELECTED])
                if var[i][_UNIT_IS_SELECTED] == 1:
                    player_units.append(unit)
                    if var[i][_UNIT_HEALTH] < 20:
                        self.count += 1

        # pad if necessary so that the current state keeps a fixed length
        for i in range(player_unit_count, 1):
            unit = [-1, -1, 0, 0, 100000, 0]
            player_units.append(unit)

        for i in range(enemy_unit_count, DEFAULT_ENEMY_COUNT):
            unit = [-1, -1, 0, 0]
            enemy_units.append(unit)

        for unit in player_units:
            for opponent in enemy_units:
                distance = int(
                    math.sqrt((unit[0] - opponent[0])**2 +
                              (unit[1] - opponent[1])**2))
                if distance < unit[4]:
                    unit[4] = distance

        # flatten the arrays so that all features form a 1D array
        feature1 = np.array(enemy_hp).flatten()  # enemy's hp
        feature2 = np.array(player_hp).flatten()  # player's hp
        feature3 = np.array(enemy).flatten()  # enemy's coordinates
        feature4 = np.array(player).flatten()  # player's coordinates
        feature5 = np.array(min_distance).flatten()  # distance
        feature6 = np.array(player_cooldown).flatten()
        feature7 = np.array(player_units).flatten()
        feature8 = np.array(enemy_units).flatten()

        # combine all features horizontally
        # current_state = np.hstack((feature1, feature2, feature3, feature4, feature5, feature6))
        current_state = np.hstack((feature7, feature8))

        return current_state, enemy_hp, player_hp, enemy, player, min_distance, is_selected, enemy_unit_count, player_unit_count, player_cooldown

    # perform the desired action calculated by the DQN
    def perform_action(self, obs, action, unit_locs, enemy_locs, selected,
                       player_count, enemy_count, distance, player_hp):
        index = -1
        for i in range(0, DEFAULT_PLAYER_COUNT):
            if selected[i] == 1:
                index = i

        x = unit_locs[index][0]
        y = unit_locs[index][1]

        if action == ATTACK_TARGET:
            if _ATTACK_SCREEN in obs.observation["available_actions"]:
                if enemy_count >= 1:
                    return actions.FunctionCall(
                        _ATTACK_SCREEN,
                        [_NOT_QUEUED, enemy_locs[0]])  # x,y => col,row

        elif action == MOVE_UP:
            if _MOVE_SCREEN in obs.observation[
                    "available_actions"] and index != -1:
                x = x
                y = y - 4
                if 3 > x:
                    x = 3
                elif x > 79:
                    x = 79
                if 3 > y:
                    y = 3
                elif y > 59:
                    y = 59
                return actions.FunctionCall(
                    _MOVE_SCREEN, [_NOT_QUEUED, [x, y]])  # x,y => col,row

        elif action == MOVE_DOWN:
            if _MOVE_SCREEN in obs.observation[
                    "available_actions"] and index != -1:
                x = x
                y = y + 4
                if 3 > x:
                    x = 3
                elif x > 79:
                    x = 79
                if 3 > y:
                    y = 3
                elif y > 59:
                    y = 59
                return actions.FunctionCall(_MOVE_SCREEN,
                                            [_NOT_QUEUED, [x, y]])

        elif action == MOVE_LEFT:
            if _MOVE_SCREEN in obs.observation[
                    "available_actions"] and index != -1:
                x = x - 4
                y = y
                if 3 > x:
                    x = 3
                elif x > 79:
                    x = 79
                if 3 > y:
                    y = 3
                elif y > 59:
                    y = 59
                return actions.FunctionCall(_MOVE_SCREEN,
                                            [_NOT_QUEUED, [x, y]])

        elif action == MOVE_RIGHT:
            if _MOVE_SCREEN in obs.observation[
                    "available_actions"] and index != -1:
                x = x + 4
                y = y
                if 3 > x:
                    x = 3
                elif x > 79:
                    x = 79
                if 3 > y:
                    y = 3
                elif y > 59:
                    y = 59
                return actions.FunctionCall(_MOVE_SCREEN,
                                            [_NOT_QUEUED, [x, y]])

        return actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, [x, y]])

    def plot_hp(self, path, save):
        plt.plot(np.arange(len(self.player_hp)), self.player_hp)
        plt.ylabel('player hp')
        plt.xlabel('training steps')
        if save:
            plt.savefig(path + '/player_hp.png')
        plt.close()

        plt.plot(np.arange(len(self.enemy_hp)), self.enemy_hp)
        plt.ylabel('enemy hp')
        plt.xlabel('training steps')
        if save:
            plt.savefig(path + '/enemy_hp.png')
        plt.close()

        plt.plot(np.arange(len(self.leftover_enemy_hp)),
                 self.leftover_enemy_hp)
        plt.ylabel('enemy hp')
        plt.xlabel('Episodes')
        if save:
            plt.savefig(path + '/eval.png')
        plt.close()

        print("AVG ENEMY HP LEFT",
              sum(self.leftover_enemy_hp) / len(self.leftover_enemy_hp))
        print("Winning Rate: {0:.2f}%".format(
            float(self.win / (self.episodes - 1) * 100)))
        print("Low hp controlled steps", self.count)

    # from the original base.agent
    def setup(self, obs_spec, action_spec):
        self.obs_spec = obs_spec
        self.action_spec = action_spec

    # from the original base.agent
    def reset(self):
        self.episodes += 1
        # added instead of the original
        self.fighting = False
        if self.episodes > 1:
            self.leftover_enemy_hp.append(sum(self.previous_enemy_hp))
            if sum(self.previous_enemy_hp) == 0:
                self.win += 1
            self.dqn.learn()

class SmartAgent(base_agent.BaseAgent):
    def __init__(self):
        self.dqn = DeepQNetwork(n_actions=524, n_features=13)
        self.previous_action = None
        self.previous_state = None
        self.episodes = 0
        self.steps = 0
        self.reward = 0
        self.reward_weights = np.array([
            .2,       ## blizz_score
            .2, .2,   ## total_unit_value, total_structure_value
            .2, .3,   ## killed_unit_value, killed_building_value
            .2, .2,   ## mineral_rate, mineral_spent
            .2, .1,   ## supply_used, supply_limit
            .3, .3,   ## army_supply, worker_supply
            .3        ## army_count
        ])

    def transformLocation(self, x, x_distance, y, y_distance):
        ## Revisit how this is evaluated
        if not self.base_top_left:
            return [x - x_distance, y - y_distance]
        return [x + x_distance, y + y_distance]

    def step(self, obs):
        super(SmartAgent, self).step(obs)

        blizz_score = obs.observation['score_cumulative'][0]
        total_unit_value = obs.observation['score_cumulative'][3]
        total_structure_value = obs.observation['score_cumulative'][4]
        killed_unit_value = obs.observation['score_cumulative'][5]
        killed_building_value = obs.observation['score_cumulative'][6]
        mineral_rate = obs.observation['score_cumulative'][9]
        mineral_spent = obs.observation['score_cumulative'][11]
        mineral_count = obs.observation['player'][1]  ## 7th
        supply_used = obs.observation['player'][3]
        supply_limit = obs.observation['player'][4]
        army_supply = obs.observation['player'][5]
        worker_supply = obs.observation['player'][6]
        army_count = obs.observation['player'][8]

        ## This should also take feature layers
        current_state = np.array([
            blizz_score, total_unit_value, total_structure_value,
            killed_unit_value, killed_building_value, mineral_rate,
            mineral_spent, mineral_count, supply_used, supply_limit,
            army_supply, worker_supply, army_count
        ])
        ## New state? 0 or 1 based on position?

        ## Choose action
        rl_action = self.dqn.choose_action(
            current_state, list(obs.observation['available_actions']))

        reward = 0
        if self.steps > 1:
            reward = np.delete(current_state, 7) - np.delete(
                self.previous_state, 7)
            reward = (reward > 0).astype(int)
            reward = np.sum(np.dot(reward, self.reward_weights))
            # print reward

            ## Store transition
            self.dqn.store_transition(self.previous_state,
                                      self.previous_action, reward,
                                      current_state)
            ## Learn
            self.dqn.learn()

        self.previous_state = current_state
        self.previous_action = rl_action

        args = [[np.random.randint(0, size) for size in arg.sizes]
                for arg in self.action_spec.functions[rl_action].args]

        return actions.FunctionCall(rl_action, args)

    if i < show_ti:
        print("end one episode")
    if r_sum > 0:
        win_time += 1

print("test done~~~")
print(win_time)

for episode in range(EP_MAX):
    # initial observation
    s = transform(board.random_start())
    r_sum = 0.0

    while True:
        # RL choose action based on observation
        action = dqn.choose_action(s, option(s), GLOBAL_N)
        # print(action)
        # if good() or bad():
        #     print(s[0:GLOBAL_M], action // GLOBAL_N, action % GLOBAL_N)

        # RL take action and get next observation and reward;
        # the flat action index decodes to (row, col) = divmod(action, GLOBAL_N)
        if MODE == "random":
            reward, s_, done = board.move(action // GLOBAL_N,
                                          action % GLOBAL_N)
        else:
            board.decide(action // GLOBAL_N, action % GLOBAL_N)
            action_space = dqn.rival(transform(board.get_board()))
            reward, s_, done = board.rival(action_space)

        r_sum += reward
        step += 1

        s_ = transform(s_)

        dqn.store_transition(s, action, reward, s_)