class StreetFighterEnv(gym.Env):
    """Gym-style wrapper around a Street Fighter ``Environment``.

    One discrete action in ``[0, 90)`` carries both inputs the wrapped
    environment expects: ``action // 10`` is passed as the move input and
    ``action % 10`` as the attack input.

    Observations are float32 arrays of shape ``(3 + 90, 128, 128)``. The first
    three planes receive the processed last frame of a step; one of the
    remaining planes is filled with ones to mark an action index.
    """

    # Action index marked in the observation when no frame is written, i.e.
    # right after ``reset`` or when a round/stage/game has just ended.
    _PLACEHOLDER_ACTION = 80

    def __init__(self, index, difficulty, monitor=None):
        """Create and start the wrapped environment.

        Args:
            index: Used to build the environment name ``"env<index>"``.
            difficulty: Forwarded to ``Environment`` unchanged.
            monitor: Optional object exposing ``record(frame)``; any falsy
                value disables recording.
        """
        self.env = Environment(
            "env{}".format(index), "roms/", difficulty=difficulty)
        self.monitor = monitor if monitor else None
        self.env.start()

        self.action_space = gym.spaces.Discrete(90)
        obs_shape = (3 + self.action_space.n, 128, 128)
        self.observation_space = gym.spaces.Box(
            low=0, high=1, shape=obs_shape, dtype=np.float32)

    def step(self, action):
        """Apply ``action`` and return ``(observation, reward, done, info)``.

        ``done`` is the environment's ``game_done`` flag. ``info`` carries the
        ``stage_done`` and ``round_done`` flags and the current stage.
        """
        move_action, attack_action = divmod(action, 10)
        frames, rewards, round_done, stage_done, game_done = self.env.step(
            move_action, attack_action)

        if self.monitor:
            for frame in frames:
                self.monitor.record(frame)

        observation = np.zeros(self.observation_space.shape, dtype=np.float32)
        if round_done or stage_done or game_done:
            # Something just ended: leave the frame planes blank, let the
            # wrapped environment reset itself and mark the placeholder
            # action instead of the one that was played.
            self.env.reset()
            action = self._PLACEHOLDER_ACTION
        else:
            observation[:3, :] = process_frame(frames[-1])
        observation[action + 3, :] = 1

        # Player 1's reward is scaled down, then a stage win adds a bonus and
        # a finished game without a stage win subtracts a penalty.
        score = rewards["P1"] / 10
        if stage_done:
            score += 3
        elif game_done:
            score -= 5

        info = dict(
            stage_done=stage_done,
            round_done=round_done,
            stage=self.env.stage,
        )
        return observation, score, game_done, info

    def reset(self):
        """Start a new game and return a blank observation.

        Only the placeholder-action plane of the returned array is set.
        """
        self.env.new_game()
        blank = np.zeros(self.observation_space.shape, dtype=np.float32)
        blank[self._PLACEHOLDER_ACTION + 3, :] = 1
        return blank

    def __exit__(self, *args):
        """Close the wrapped environment when leaving a ``with`` block."""
        return self.env.close()
def run_env(worker_id, roms_path):
    """Drive one environment with uniformly random inputs, forever.

    Args:
        worker_id: Used to build the environment name ``"env<worker_id>"``.
        roms_path: ROM directory handed to ``Environment``.

    This function never returns.
    """
    env = Environment(f"env{worker_id}", roms_path)
    env.start()

    while True:
        # The move input is sampled before the attack input.
        move_action = random.randint(0, 8)
        attack_action = random.randint(0, 9)
        frames, reward, round_done, stage_done, game_done = env.step(
            move_action, attack_action)

        # Highest-priority finished flag wins: game, then stage, then round.
        transitions = (
            (game_done, "new_game"),
            (stage_done, "next_stage"),
            (round_done, "next_round"),
        )
        method_name = next(
            (name for finished, name in transitions if finished), None)
        if method_name is not None:
            getattr(env, method_name)()
class StreetFighterEnv(object):
    """Thin wrapper around a Street Fighter ``Environment``.

    A single integer action encodes both inputs of the wrapped environment:
    ``action // 10`` is passed as the move input and ``action % 10`` as the
    attack input.
    """

    # Shape of the all-zero observation returned whenever a round, stage or
    # game has just ended (and by ``reset``).
    _BLANK_SHAPE = (1, 3, 168, 168)

    def __init__(self, index, monitor=None,
                 roms_path="/home/zhangchao/Downloads/"):
        """Create and start the wrapped environment.

        Args:
            index: Used to build the environment name ``"env<index>"``.
            monitor: Optional object exposing ``record(frame)``; any falsy
                value disables recording.
            roms_path: Directory containing the ROMs. The default is the
                previously hard-coded location, kept for backward
                compatibility; pass your own path on any other machine.
        """
        self.env = Environment("env{}".format(index), roms_path)
        self.monitor = monitor if monitor else None
        self.env.start()

    def _blank_observation(self):
        """Return a fresh all-zero float32 observation."""
        return np.zeros(self._BLANK_SHAPE, dtype=np.float32)

    def step(self, action):
        """Run one step.

        Returns:
            ``(frames, reward, round_done, stage_done, game_done)``: the
            observation, the shaped reward, and the three "finished" flags
            reported by the wrapped environment.
        """
        move_action, attack_action = divmod(action, 10)
        frames, reward, round_done, stage_done, game_done = self.env.step(
            move_action, attack_action)

        if self.monitor:
            for frame in frames:
                # Original note: the monitor writes the image data to a
                # child process through a pipe.
                self.monitor.record(frame)

        if round_done or stage_done or game_done:
            frames = self._blank_observation()
        else:
            # Concatenate the processed frames along axis 0 and add a leading
            # axis of size 1.
            stacked = np.concatenate(
                [process_frame(frame) for frame in frames], 0)
            frames = stacked[None, :, :, :].astype(np.float32)

        # Reward design: player 1's reward by default, replaced by a fixed
        # bonus for a stage win or a fixed penalty for a finished game, then
        # scaled up with the stage number and divided by 10.
        reward = reward["P1"]
        if stage_done:
            reward = 25
        elif game_done:
            reward = -50
        reward *= (1 + (self.env.stage - 1) / 10)
        reward /= 10
        return frames, reward, round_done, stage_done, game_done

    def reset(self, round_done, stage_done, game_done):
        """Advance the wrapped environment past whatever just ended.

        The flags are checked in priority order game, stage, round; at most
        one of ``new_game``, ``next_stage`` and ``next_round`` is called.

        Returns:
            An all-zero float32 observation.
        """
        if game_done:
            self.env.new_game()
        elif stage_done:
            self.env.next_stage()
        elif round_done:
            self.env.next_round()
        return self._blank_observation()
def run(worker_no, roms_path, learning_rate, cluster, data_bins, stats,
        saves_path, save_frequency):
    """Run one distributed training worker until it fails.

    Builds the shared ``"global"`` model and a worker-local model, starts a
    TensorFlow server for this worker and then plays games in an endless
    loop. Each finished round is stored in ``data_bins``; each finished game
    triggers a training step, a refresh of the local weights, a stats update
    and, every ``save_frequency`` episodes, a checkpoint.

    Args:
        worker_no: Index of this worker; used for the model scope name
            ``"worker<worker_no>"``, the device string and the task index.
        roms_path: ROM directory handed to ``Environment``.
        learning_rate: Forwarded to both ``Model`` instances.
        cluster: Cluster spec used for device placement and the server.
        data_bins: Store that receives round histories and yields the
            training data via ``empty_bin(worker_no)``.
        stats: Shared statistics object (``get_episode`` / ``update``).
        saves_path: Directory containing ``model.ckpt``.
        save_frequency: Save a checkpoint whenever the episode count is a
            positive multiple of this value.

    Any ``Exception`` is printed, logged and turned into exit status 1.
    """
    import sys  # Local import: file-level imports are not visible here.

    def _empty_history():
        """Return a fresh per-round history container."""
        return {
            "observation": [],
            "move_action": [],
            "attack_action": [],
            "reward": []
        }

    name = "worker%d" % worker_no

    with tf.device(
            tf.train.replica_device_setter(
                worker_device="/job:worker/task:%d" % worker_no,
                cluster=cluster)):
        Model("global", learning_rate)

    # NOTE(review): the saver and the local model are created outside the
    # replica device scope -- confirm this placement is the intended one.
    saver = tf.train.Saver(var_list=tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope="global"))
    local_model = Model(name, learning_rate)
    update_local_ops = update_target_graph('global', name)

    gpu_options = tf.GPUOptions(allow_growth=True)
    config = tf.ConfigProto(gpu_options=gpu_options)
    server = tf.train.Server(cluster,
                             job_name="worker",
                             task_index=worker_no,
                             config=config)

    with tf.train.MonitoredTrainingSession(master=server.target) as sess:
        try:
            # The saver is handed the innermost session object, reached
            # through private attributes of the monitored session wrappers.
            if stats.get_episode() != 0:
                saver.restore(sess._sess._sess._sess._sess,
                              f'{saves_path}/model.ckpt')

            sess.run(update_local_ops)
            print("Started Worker Updates")
            env = Environment(name, roms_path, difficulty=3)
            frames = env.start()

            while True:
                history = _empty_history()
                game_done = False
                total_reward = 0

                while not game_done:
                    observation = wu.prepro(frames)
                    history["observation"].append(observation)

                    move_out, attack_out = sess.run(
                        [local_model.move_out_sym,
                         local_model.attack_out_sym],
                        feed_dict={local_model.observation_sym: observation})
                    move_action_hot = wu.choose_action(move_out)
                    attack_action_hot = wu.choose_action(attack_out)
                    history["move_action"].append(move_action_hot)
                    history["attack_action"].append(attack_action_hot)

                    frames, r, round_done, stage_done, game_done = env.step(
                        np.argmax(move_action_hot),
                        np.argmax(attack_action_hot))
                    total_reward += r["P1"]
                    history["reward"].append(r["P1"])

                    # Assumes a finished game is always reported together
                    # with a finished round -- TODO confirm.
                    if round_done:
                        wu.store_history(data_bins, worker_no, history)
                        history = _empty_history()

                        if game_done:
                            wu.train(sess, local_model,
                                     *data_bins.empty_bin(worker_no))
                            sess.run(update_local_ops)
                            stats.update({
                                "score": total_reward,
                                "stage": env.stage
                            })
                            episode = stats.get_episode()
                            if episode > 0 and \
                                    stats.get_episode() % save_frequency == 0:
                                saver.save(sess._sess._sess._sess._sess,
                                           f'{saves_path}/model.ckpt')

                        frames = env.reset()
        except Exception:
            # Catch ``Exception`` rather than everything so that
            # KeyboardInterrupt / SystemExit are not mistaken for failures.
            error = traceback.format_exc()
            print(error)
            logging.error(error)
            # ``sys.exit`` instead of the interactive-only ``exit`` helper.
            sys.exit(1)
class Worker(mp.Process):
    """Process that alternates between playing games and training a model.

    ``run`` creates the emulator environment inside the child process and then
    loops forever: play ``epoch_size`` games with the model in eval mode,
    compile the recorded histories into a dataset, and train on it.
    """

    def __init__(self, env_id, roms_path, difficulty, epoch_size, model,
                 optim, criterion, rewardQueue, frameRatio, framesPerStep):
        """Store the configuration; nothing is started here.

        Args:
            env_id: Identifier handed to ``Environment``.
            roms_path: ROM directory handed to ``Environment``.
            difficulty: Forwarded to ``Environment``.
            epoch_size: Number of games played per ``generate_playthrough``.
            model: Callable returning ``(move_out, attack_out)`` for an
                observation; also put into eval/train mode by ``run``.
            optim: Optimizer passed to ``wu.train``.
            criterion: Loss passed to ``wu.train``.
            rewardQueue: Queue receiving one dict per finished game.
            frameRatio: Forwarded to ``Environment`` as ``frame_ratio``.
            framesPerStep: Forwarded to ``Environment`` as
                ``frames_per_step``.
        """
        super(Worker, self).__init__()
        self.env_id = env_id
        self.roms_path = roms_path
        self.difficulty = difficulty
        self.epoch_size = epoch_size
        self.model = model
        self.optim = optim
        self.criterion = criterion
        self.rewardQueue = rewardQueue
        self.frameRatio = frameRatio
        self.framesPerStep = framesPerStep

    def run(self):
        """Process entry point: play and train until an exception occurs.

        Exceptions are logged (message and traceback) and end the loop.
        """
        try:
            logger.info("Starting Worker")
            self.env = Environment(self.env_id,
                                   self.roms_path,
                                   difficulty=self.difficulty,
                                   frame_ratio=self.frameRatio,
                                   frames_per_step=self.framesPerStep,
                                   throttle=False)
            frames = self.env.start()
            while True:
                self.model.eval()
                observations, histories, frames = self.generate_playthrough(
                    frames)
                self.model.train()
                dataset = wu.compileHistories(observations, histories)
                wu.train(self.model, self.optim, self.criterion, dataset)
        except Exception as identifier:
            logger.error(identifier)
            logger.error(traceback.format_exc())

    @staticmethod
    def _empty_history():
        """Return a fresh per-round history container."""
        return {"moveAction": [], "attackAction": [], "reward": []}

    def generate_playthrough(self, frames):
        """Play ``epoch_size`` games and record every round.

        Args:
            frames: Frames to start from (from ``env.start()`` or from the
                previous call).

        Returns:
            ``(observations, histories, frames)``. ``observations`` and
            ``histories`` hold one entry per round, plus one trailing empty
            entry opened after the last finished round. ``frames`` is
            whatever the final ``env.reset()`` returned.

        The dict put on ``rewardQueue`` after each game carries the reward
        accumulated so far in this call, not the reward of that game alone.
        """
        observations = [[]]
        histories = [self._empty_history()]
        epoch_reward = 0
        total_round = 0

        for _ in range(self.epoch_size):
            # Clear the flag for every game. It used to be initialised once
            # before this loop, so only the first iteration ever played and
            # ``epoch_size`` had no effect beyond 1.
            game_done = False
            while not game_done:
                x = wu.prepro(frames)
                observations[total_round].append(x.cpu())

                moveOut, attackOut = self.model(Variable(x))
                moveAction = wu.chooseAction(F.softmax(moveOut, dim=1))
                attackAction = wu.chooseAction(F.softmax(attackOut, dim=1))

                current = histories[total_round]
                current["moveAction"].append(
                    torch.FloatTensor(1).fill_(moveAction))
                current["attackAction"].append(
                    torch.FloatTensor(1).fill_(attackAction))

                frames, r, round_done, stage_done, game_done = self.env.step(
                    moveAction, attackAction)
                current["reward"].append(torch.FloatTensor(1).fill_(r["P1"]))
                epoch_reward += r["P1"]

                # Assumes a finished game is always reported together with a
                # finished round -- TODO confirm.
                if round_done:
                    total_round += 1
                    histories.append(self._empty_history())
                    observations.append([])
                    if game_done:
                        self.rewardQueue.put({
                            "reward": epoch_reward,
                            "stage": self.env.stage
                        })
                    frames = self.env.reset()

        return observations, histories, frames
import random

from MAMEToolkit.sf_environment import Environment

# Minimal random agent: feed uniformly sampled inputs to the emulator forever
# and advance the game whenever a round, stage or game finishes.
roms_path = 'rom/'
env = Environment("sfiii3n", roms_path)
env.start()

while True:
    # The move input is sampled before the attack input.
    move_action = random.randint(0, 8)
    attack_action = random.randint(0, 9)
    frames, reward, round_done, stage_done, game_done = env.step(
        move_action, attack_action)

    # Only the highest-priority finished flag is acted on:
    # game, then stage, then round.
    if game_done:
        env.new_game()
        continue
    if stage_done:
        env.next_stage()
        continue
    if round_done:
        env.next_round()
import random
import time

from MAMEToolkit.sf_environment import Environment

# Plays a single game, advancing rounds and stages as they finish, then
# closes the emulator.
roms_path = 'roms/'  # Replace this with the path to your ROMs
env = Environment('env1', roms_path)
print('[random] Loaded SF3 from ROM file')

# For what env.start() does, see:
# https://github.com/M-J-Murray/MAMEToolkit/blob/3041734391292376aa909938ea5b51030e3c0240/MAMEToolkit/sf_environment/Environment.py#L88
print('[random] Wait until learnable gameplay starts...')
frames = env.start()
print('[random] Start!')

game_done = False
while not game_done:
    move_action = random.randint(0, 8)
    attack_action = random.randint(0, 9)
    # NOTE(review): the sampled actions above are not used; the step is
    # always taken with inputs (0, 0). Confirm whether this is deliberate.
    frames, reward, round_done, stage_done, game_done = env.step(0, 0)

    if not game_done:
        if stage_done:
            print('[random] Stage finished: retrieving next state.')
            env.next_stage()
        elif round_done:
            print('[random] Round finished: retrieving next round.')
            env.next_round()

print('[random] Game finished: closing environment')
env.close()