# Gym wrapper around the MAMEToolkit Street Fighter environment.
# process_frame is a project-local preprocessing helper assumed to be in scope.
import gym
import numpy as np

from MAMEToolkit.sf_environment import Environment  # assumed import path


class StreetFighterEnv(gym.Env):
    def __init__(self, index, difficulty, monitor=None):
        roms_path = "roms/"
        self.env = Environment("env{}".format(index), roms_path, difficulty=difficulty)
        self.monitor = monitor if monitor else None
        self.env.start()
        # 90 discrete actions: move index (action // 10) combined with attack index (action % 10).
        self.action_space = gym.spaces.Discrete(90)
        # Observation: 3 image channels plus one channel per action (one-hot of the chosen action).
        self.observation_space = gym.spaces.Box(
            low=0, high=1,
            shape=(3 + self.action_space.n, 128, 128),
            dtype=np.float32)

    def step(self, action):
        move_action = action // 10
        attack_action = action % 10
        frames, reward, round_done, stage_done, game_done = self.env.step(
            move_action, attack_action)
        if self.monitor:
            for frame in frames:
                self.monitor.record(frame)
        states = np.zeros(self.observation_space.shape, dtype=np.float32)
        if not (round_done or stage_done or game_done):
            states[:3, :] = process_frame(frames[-1])
        else:
            # Between rounds/stages the screen is not informative; reset and
            # fall back to a fixed default action index.
            self.env.reset()
            action = 80
        states[action + 3, :] = 1
        reward = reward["P1"] / 10
        if stage_done:
            reward += 3
        elif game_done:
            reward -= 5
        info = {
            'stage_done': stage_done,
            'round_done': round_done,
            'stage': self.env.stage
        }
        return states, reward, game_done, info

    def reset(self):
        self.env.new_game()
        states = np.zeros(self.observation_space.shape, dtype=np.float32)
        states[80 + 3, :] = 1
        return states

    def __exit__(self, *args):
        return self.env.close()
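
# Usage sketch (illustrative, not part of the training code): drive the wrapper
# with random actions until the game ends. Assumes the ROMs are available under
# "roms/"; the index/difficulty values below are arbitrary examples.
if __name__ == "__main__":
    env = StreetFighterEnv(index=0, difficulty=3)  # __init__ already starts the emulator
    done = False
    while not done:
        action = env.action_space.sample()  # random move/attack combination
        states, reward, done, info = env.step(action)
        print("stage {} reward {:.2f}".format(info["stage"], reward))
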
import logging
import traceback

import numpy as np
import tensorflow as tf

# Environment comes from MAMEToolkit; Model, update_target_graph and the worker
# utilities module `wu` are project-local and assumed to be importable here.


def run(worker_no, roms_path, learning_rate, cluster, data_bins, stats,
        saves_path, save_frequency):
    name = "worker%d" % worker_no
    # Build the shared "global" network and this worker's local copy under the
    # replica device setter, plus an op that copies global weights into the local model.
    with tf.device(
            tf.train.replica_device_setter(
                worker_device="/job:worker/task:%d" % worker_no,
                cluster=cluster)):
        Model("global", learning_rate)
        saver = tf.train.Saver(var_list=tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope="global"))
        local_model = Model(name, learning_rate)
        update_local_ops = update_target_graph('global', name)

    gpu_options = tf.GPUOptions(allow_growth=True)
    config = tf.ConfigProto(gpu_options=gpu_options)
    server = tf.train.Server(cluster, job_name="worker",
                             task_index=worker_no, config=config)

    with tf.train.MonitoredTrainingSession(master=server.target) as sess:
        try:
            if stats.get_episode() != 0:
                # MonitoredTrainingSession wraps the raw session; unwrap it for the saver.
                saver.restore(sess._sess._sess._sess._sess, f'{saves_path}/model.ckpt')
            sess.run(update_local_ops)
            print("Started Worker Updates")
            env = Environment(name, roms_path, difficulty=3)
            frames = env.start()
            while True:
                history = {
                    "observation": [],
                    "move_action": [],
                    "attack_action": [],
                    "reward": []
                }
                game_done = False
                total_reward = 0
                while not game_done:
                    observation = wu.prepro(frames)
                    history["observation"].append(observation)
                    move_out, attack_out = sess.run(
                        [local_model.move_out_sym, local_model.attack_out_sym],
                        feed_dict={local_model.observation_sym: observation})
                    move_action_hot = wu.choose_action(move_out)
                    attack_action_hot = wu.choose_action(attack_out)
                    history["move_action"].append(move_action_hot)
                    history["attack_action"].append(attack_action_hot)
                    frames, r, round_done, stage_done, game_done = env.step(
                        np.argmax(move_action_hot), np.argmax(attack_action_hot))
                    total_reward += r["P1"]
                    history["reward"].append(r["P1"])
                    if round_done:
                        # Bank the finished round and start a fresh history.
                        wu.store_history(data_bins, worker_no, history)
                        history = {
                            "observation": [],
                            "move_action": [],
                            "attack_action": [],
                            "reward": []
                        }
                if game_done:
                    # Train on the collected episode, re-sync the local copy with the
                    # global network, and periodically checkpoint the global variables.
                    wu.train(sess, local_model, *data_bins.empty_bin(worker_no))
                    sess.run(update_local_ops)
                    stats.update({"score": total_reward, "stage": env.stage})
                    if stats.get_episode() > 0 and stats.get_episode() % save_frequency == 0:
                        saver.save(sess._sess._sess._sess._sess, f'{saves_path}/model.ckpt')
                    frames = env.reset()
        except:
            error = traceback.format_exc()
            print(error)
            logging.error(error)
            exit(1)
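
# Launch sketch (illustrative): one way to wire up a cluster and start several of
# these workers on one machine. The ports, worker count, learning rate, and the
# DataBins/Stats helpers are assumptions; in the project these live in its own launcher.
import multiprocessing as mp


def start_ps(cluster):
    # Parameter server process that hosts the shared "global" variables.
    tf.train.Server(cluster, job_name="ps", task_index=0).join()


if __name__ == "__main__":
    n_workers = 4
    cluster = tf.train.ClusterSpec({
        "ps": ["localhost:2220"],
        "worker": ["localhost:%d" % (2222 + i) for i in range(n_workers)],
    })
    data_bins, stats = DataBins(n_workers), Stats()  # assumed project-local helpers
    procs = [mp.Process(target=start_ps, args=(cluster,))]
    procs += [mp.Process(target=run,
                         args=(i, "roms/", 1e-4, cluster, data_bins, stats,
                               "saves", 10))
              for i in range(n_workers)]
    for p in procs:
        p.start()
    for p in procs[1:]:
        p.join()
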
import logging
import multiprocessing as mp
import traceback

import torch
import torch.nn.functional as F
from torch.autograd import Variable

from MAMEToolkit.sf_environment import Environment  # assumed import path
# `wu` is the project-local worker utilities module (prepro, chooseAction,
# compileHistories, train) and is assumed to be importable.

logger = logging.getLogger(__name__)


class Worker(mp.Process):
    def __init__(self, env_id, roms_path, difficulty, epoch_size, model, optim,
                 criterion, rewardQueue, frameRatio, framesPerStep):
        super(Worker, self).__init__()
        self.env_id = env_id
        self.roms_path = roms_path
        self.difficulty = difficulty
        self.epoch_size = epoch_size
        self.model = model
        self.optim = optim
        self.criterion = criterion
        self.rewardQueue = rewardQueue
        self.frameRatio = frameRatio
        self.framesPerStep = framesPerStep

    def run(self):
        try:
            logger.info("Starting Worker")
            self.env = Environment(self.env_id, self.roms_path,
                                   difficulty=self.difficulty,
                                   frame_ratio=self.frameRatio,
                                   frames_per_step=self.framesPerStep,
                                   throttle=False)
            frames = self.env.start()
            while True:
                # Collect an epoch of experience with the model in eval mode,
                # then switch to train mode and update on the compiled histories.
                self.model.eval()
                observations, histories, frames = self.generate_playthrough(frames)
                self.model.train()
                dataset = wu.compileHistories(observations, histories)
                wu.train(self.model, self.optim, self.criterion, dataset)
        except Exception as identifier:
            logger.error(identifier)
            logger.error(traceback.format_exc())

    def generate_playthrough(self, frames):
        observations = [[]]
        histories = [{"moveAction": [], "attackAction": [], "reward": []}]
        epoch_reward = 0
        total_round = 0
        game_done = False
        for i in range(self.epoch_size):
            while not game_done:
                x = wu.prepro(frames)
                observations[total_round].append(x.cpu())
                moveOut, attackOut = self.model(Variable(x))
                moveAction = wu.chooseAction(F.softmax(moveOut, dim=1))
                attackAction = wu.chooseAction(F.softmax(attackOut, dim=1))
                histories[total_round]["moveAction"].append(
                    torch.FloatTensor(1).fill_(moveAction))
                histories[total_round]["attackAction"].append(
                    torch.FloatTensor(1).fill_(attackAction))
                frames, r, round_done, stage_done, game_done = self.env.step(
                    moveAction, attackAction)
                histories[total_round]["reward"].append(
                    torch.FloatTensor(1).fill_(r["P1"]))
                epoch_reward += r["P1"]
                if round_done:
                    # A round finished: start a new per-round history/observation bucket.
                    total_round += 1
                    histories.append({"moveAction": [], "attackAction": [], "reward": []})
                    observations.append([])
            if game_done:
                self.rewardQueue.put({"reward": epoch_reward, "stage": self.env.stage})
                frames = self.env.reset()
                game_done = False  # start a fresh game for the next epoch iteration
        return observations, histories, frames
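
# Launch sketch (illustrative): spawning a pool of these workers around a shared
# model. The Net policy network, optimiser, loss, and hyper-parameters below are
# assumptions, not the project's actual training configuration.
if __name__ == "__main__":
    model = Net()                       # assumed project-local policy network
    model.share_memory()                # share weights across worker processes
    optimiser = torch.optim.Adam(model.parameters(), lr=1e-4)
    criterion = torch.nn.CrossEntropyLoss()
    rewardQueue = mp.Queue()
    workers = [Worker("env%d" % i, "roms/", 3, 1, model, optimiser, criterion,
                      rewardQueue, frameRatio=3, framesPerStep=3)
               for i in range(4)]
    for w in workers:
        w.start()
    while True:
        print(rewardQueue.get())        # stream episode rewards from the workers
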
# Macro variant of StreetFighterEnv above; shares the same imports, Environment
# wrapper, and process_frame helper. MACRO_NUMS, index_to_move_action,
# index_to_attack_action and index_to_comb are project-local and assumed in scope.
class MacroStreetFighterEnv(gym.Env):
    def __init__(self, index, difficulty, monitor=None):
        roms_path = "roms/"
        self.env = Environment("env{}".format(index), roms_path, difficulty=difficulty)
        self.monitor = monitor if monitor else None
        self.env.start()
        # 9 move actions + 9 attack actions + MACRO_NUMS scripted combos.
        self.action_space = gym.spaces.Discrete(18 + MACRO_NUMS)
        self.observation_space = gym.spaces.Box(
            low=0, high=1,
            shape=(3 + self.action_space.n, 128, 128),
            dtype=np.float32)

    def step(self, action):
        frames, reward, round_done, stage_done, game_done = self.step_(action)
        if self.monitor:
            for frame in frames:
                self.monitor.record(frame)
        states = np.zeros(self.observation_space.shape, dtype=np.float32)
        if not (round_done or stage_done or game_done):
            states[:3, :] = process_frame(frames[-1])
        else:
            self.env.reset()
            action = 8
        states[action + 3, :] = 1
        reward = reward["P1"] / 10
        if stage_done:
            reward += 3
        elif game_done:
            reward -= 5
        info = {
            'stage_done': stage_done,
            'round_done': round_done,
            'stage': self.env.stage
        }
        return states, reward, game_done, info

    def step_(self, action):
        if not self.env.started:
            raise EnvironmentError("Start must be called before stepping")
        if self.env.round_done or self.env.stage_done or self.env.game_done:
            raise EnvironmentError("Attempted to step while characters are not fighting")
        # Indices 0-8 are moves, 9-17 are attacks, 18+ are scripted macros.
        if action < 9:
            actions = index_to_move_action(action)
        elif action < 18:
            actions = index_to_attack_action(action - 9)
        elif action < 18 + MACRO_NUMS:
            actions = index_to_comb[action - 18]()
        else:
            raise EnvironmentError("Action out of range")
        if action < 18:
            data = self.env.gather_frames(actions)
        else:
            data = self.sub_step_(actions)
        data = self.env.check_done(data)
        return (data["frame"], data["rewards"], self.env.round_done,
                self.env.stage_done, self.env.game_done)

    def sub_step_(self, actions):
        # Play out a scripted macro: hold each set of inputs for the requested number
        # of emulator steps, then release, collecting frames along the way.
        frames = []
        for step in actions:
            for i in range(step["hold"]):
                data = self.env.emu.step([action.value for action in step["actions"]])
                frames.append(data['frame'])
            data = self.env.emu.step([])
            frames.append(data['frame'])
        # Reward is the health swing over the whole macro, from each player's perspective.
        p1_diff = self.env.expected_health["P1"] - data["healthP1"]
        p2_diff = self.env.expected_health["P2"] - data["healthP2"]
        self.env.expected_health = {"P1": data["healthP1"], "P2": data["healthP2"]}
        rewards = {"P1": p2_diff - p1_diff, "P2": p1_diff - p2_diff}
        data["rewards"] = rewards
        data["frame"] = frames
        return data

    def reset(self):
        self.env.new_game()
        states = np.zeros(self.observation_space.shape, dtype=np.float32)
        states[8 + 3, :] = 1
        return states

    def __exit__(self, *args):
        return self.env.close()
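
# Illustrative sketch of the shape of an index_to_comb entry consumed by sub_step_():
# each macro is a callable returning a list of steps, where each step holds a set of
# input-enum values for a number of emulator frames. It assumes an Actions input enum
# is in scope (as in MAMEToolkit's Street Fighter environment); the member names and
# frame counts below are assumptions, not the project's actual combo definitions.
def example_fireball_macro():
    return [
        {"hold": 2, "actions": [Actions.P1_DOWN]},
        {"hold": 2, "actions": [Actions.P1_DOWN, Actions.P1_RIGHT]},
        {"hold": 2, "actions": [Actions.P1_RIGHT, Actions.P1_JPUNCH]},
    ]

# index_to_comb = {0: example_fireball_macro, ...}  # assumed mapping from macro index to callable
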