def main():
    """Run DQN until the environment throws an exception."""
    env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())
        dqn.train(num_steps=2000000,  # Make sure an exception arrives before we stop.
                  player=player,
                  replay_buffer=StochasticMaxStochasticDeltaDeletionPRB(500000, 0.5, 0.4, epsilon=0.1),
                  optimize_op=optimize,
                  train_interval=1,
                  target_interval=8192,
                  batch_size=32,
                  min_buffer_size=20000)
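# The replay buffer above takes (capacity, alpha, beta, epsilon), the usual
# knobs of proportional prioritized replay. A minimal sketch of that sampling
# math only (an illustration, not the buffer's actual implementation;
# `td_errors` is a hypothetical stand-in for the stored priorities):
import numpy as np

def prioritized_sample(td_errors, batch_size, alpha=0.5, beta=0.4, epsilon=0.1):
    # p_i = (|delta_i| + epsilon)^alpha, sampled proportionally.
    priorities = (np.abs(td_errors) + epsilon) ** alpha
    probs = priorities / priorities.sum()
    idx = np.random.choice(len(td_errors), size=batch_size, p=probs)
    # Importance-sampling weights correct the sampling bias, annealed via beta.
    weights = (len(td_errors) * probs[idx]) ** (-beta)
    weights /= weights.max()
    return idx, weights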
def test():
    env = envWrapper(SubprocVecEnv(sonic.make_envs(num=1)))
    model = learn(env, 20, 0, 1e6)
    total_reward = 0.0
    test_env = sonic.make_env()
    for i in range(1):
        obs = env.reset()
        while True:
            # need to unsqueeze eval output
            action_index, _, _, _ = model.eval_and_sample(
                torch.tensor(obs, dtype=torch.float).to(device))
            obs, reward, done = env.step(action_index)
            total_reward += np.sum(reward)
            if done.any():
                break
        print("{} testgames done".format(i + 1))
    total_reward_rand = 0
    for i in range(1):
        obs = env.reset()
        while True:
            obs, reward, done = env.step(
                [env.env.action_space.sample() for i in range(env.num_envs)])
            total_reward_rand += np.sum(reward)
            if done.any():
                break
        print("{} testgames done".format(i + 1))
    print("total_reward: {}".format(total_reward))
    print("total_reward_rand: {}".format(total_reward_rand))
def main():
    """Run DQN until the environment throws an exception."""
    # env = make(game='SonicAndKnuckles3-Genesis', state='AngelIslandZone.Act1')
    # env = SonicDiscretizer(env)
    # env = WarpFrame(env)
    # env = AllowBacktracking(env)
    env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 4)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())
        dqn.train(num_steps=2000000,  # Make sure an exception arrives before we stop.
                  player=player,
                  replay_buffer=PrioritizedReplayBuffer(500000, 0.7, 0.6, epsilon=0.2),
                  optimize_op=optimize,
                  train_interval=1,
                  target_interval=16384,
                  batch_size=64,
                  min_buffer_size=20000)
def test():
    env = envWrapper(make_env()())
    model = learn(env, 3000, 3e4, 2e-4)
    total_reward = 0
    for i in range(30):
        obs = env.reset()
        while True:
            action_index, _, _, _ = model.evaluate(
                torch.unsqueeze(torch.tensor(obs, dtype=torch.float).to(device), 0))
            obs, reward, done = env.step(action_index)
            print(action_index)
            total_reward += reward
            if done:
                break
        print("{} testgames done".format(i))
    total_reward_rand = 0
    for i in range(30):
        obs = env.reset()
        while True:
            obs, reward, done = env.step(env.env.action_space.sample())
            total_reward_rand += reward
            if done:
                break
        print("{} testgames done".format(i))
    print("total_reward: {}".format(total_reward))
    print("total_reward_rand: {}".format(total_reward_rand))
def main():
    env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-421,
                                  max_val=421))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())
        dqn.train(num_steps=2000000,
                  player=player,
                  replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1),
                  optimize_op=optimize,
                  train_interval=1,
                  target_interval=64,
                  batch_size=32,
                  min_buffer_size=25000)
def main():
    env = make_env()
    max_screen_x = 0
    model = DQN.load(saved_model_file_path)
    obs = env.reset()

    # Pace playback in real time: each env step covers `frames_per_timestep`
    # emulator frames, sped up by `speed_up_factor`.
    fps = 60
    frames_per_timestep = 4
    speed_up_factor = 1.5
    wait_time = frames_per_timestep / fps / speed_up_factor

    while True:
        t1 = time.time()
        action, _states = model.predict(obs)
        t2 = time.time()
        # Sleep off whatever part of the frame budget prediction did not use.
        t3 = wait_time - (t2 - t1)
        if t3 > 0:
            time.sleep(t3)
        obs, rewards, done, info = env.step(action)
        if info['screen_x'] > max_screen_x:
            max_screen_x = info['screen_x']
            logger.info("Max screen x: " + str(max_screen_x))
        if done:
            env.reset()
        else:
            env.render()
def main():
    training_episodes = 1000
    # env = make(game='SonicAndKnuckles3-Genesis', state='AngelIslandZone.Act1')
    env = make_env(stack=False, scale_rew=False)
    obs = env.reset()
    espio = DDQN()
    score = 0
    # training
    for e in range(0, training_episodes):
        action = env.action_space.sample()
        obs_prime, rew, done, info = env.step(action)
        score_prime = info['score']
        delta_score = score_prime - score
        D = (obs_prime.flatten(), action, rew, delta_score, obs.flatten(), done)
        espio.experience_replay = np.append(espio.experience_replay, [D], axis=0)
        env.render()
        espio.train()
        obs = obs_prime
        score = score_prime  # reset score to score_prime
        if done:
            obs = env.reset()
            score = 0
    obs = env.reset()  # reset before testing begins
    # finished training
    while True:
        obs, rew, done, info = env.step(env.action_space.sample())
        env.render()
        if done:
            obs = env.reset()
def main():
    """Run DQN until the environment throws an exception."""
    env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        # dqn.optimize creates a TF Op that optimizes the objective.
        # Args:
        #   learning_rate: the Adam learning rate.
        #   epsilon: the Adam epsilon.
        optimize = dqn.optimize(learning_rate=6.25e-5, epsilon=1.5e-4)
        sess.run(tf.global_variables_initializer())
        # dqn.train runs an automated training loop. This is meant to provide
        # a convenient way to run a standard training loop without any
        # modifications. You may get more flexibility by writing your own
        # training loop.
        # Args:
        #   num_steps: the number of timesteps to run.
        #   player: the Player for gathering experience.
        #   replay_buffer: the ReplayBuffer for experience.
        #   optimize_op: a TF Op to optimize the model.
        #   train_interval: timesteps per training step.
        #   target_interval: number of timesteps between target network updates.
        #   batch_size: the size of experience mini-batches.
        #   min_buffer_size: minimum replay buffer size before training is performed.
        #   tf_schedules: a sequence of TFSchedules that are updated with the
        #     number of steps taken.
        #   handle_ep: called with information about every completed episode.
        #   timeout: if set, a number of seconds after which the training loop
        #     should exit.
        dqn.train(num_steps=1000000,  # Make sure an exception arrives before we stop.
                  player=player,
                  replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1),
                  optimize_op=optimize,
                  train_interval=1,
                  target_interval=8192,
                  batch_size=32,
                  min_buffer_size=20000)
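# The dqn.train() docs above note that a hand-written loop gives more
# flexibility. A minimal sketch of such a loop, assuming anyrl's Player.play(),
# ReplayBuffer.add_sample()/sample()/update_weights(), and DQN.feed_dict()
# behave as in anyrl-py; treat this as an illustration, not a drop-in:
def manual_train_loop(sess, dqn, player, replay_buffer, optimize_op,
                      num_steps=1000000, batch_size=32,
                      min_buffer_size=20000, target_interval=8192):
    # Sync the target network once up front.
    sess.run(dqn.update_target)
    steps = 0
    while steps < num_steps:
        for trans in player.play():
            replay_buffer.add_sample(trans)
            steps += 1
            if replay_buffer.size >= min_buffer_size:
                batch = replay_buffer.sample(batch_size)
                _, losses = sess.run((optimize_op, dqn.losses),
                                     feed_dict=dqn.feed_dict(batch))
                # Feed the new TD errors back as priorities.
                replay_buffer.update_weights(batch, losses)
            if steps % target_interval == 0:
                sess.run(dqn.update_target)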
def main():
    """Run DQN until the environment throws an exception."""
    env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        # Referenced by tf_schedules below, so it must stay defined.
        eps_decay_sched = LinearTFSchedule(50000, 1.0, 0.01)
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        # Other exploration schedules:
        #player = NStepPlayer(BatchedPlayer(env, EpsGreedyQNetwork(dqn.online_net, 0.1)), 3)
        #player = NStepPlayer(BatchedPlayer(env, EpsGreedyQNetwork(dqn.online_net, TFScheduleValue(sess, eps_decay_sched))), 3)
        #player = NStepPlayer(BatchedPlayer(env, SonicEpsGreedyQNetwork(dqn.online_net, TFScheduleValue(sess, eps_decay_sched))), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())

        reward_hist = []
        total_steps = 0

        def _handle_ep(steps, rew, env_rewards):
            nonlocal total_steps
            total_steps += steps
            reward_hist.append(rew)
            if total_steps % 10 == 0:
                print('%d episodes, %d steps: mean of last 100 episodes=%f' %
                      (len(reward_hist), total_steps,
                       sum(reward_hist[-100:]) / len(reward_hist[-100:])))

        dqn.train(num_steps=2000000,  # Make sure an exception arrives before we stop.
                  player=player,
                  replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1),
                  optimize_op=optimize,
                  train_interval=1,
                  target_interval=8192,
                  batch_size=32,
                  min_buffer_size=20000,
                  tf_schedules=[eps_decay_sched],
                  handle_ep=_handle_ep,
                  restore_path='./pretrained_model',
                  save_interval=None)
def main():
    print('connecting to remote environment')
    env = make_env(stack=False)
    print('starting episode')
    env.reset()
    while True:
        obs, rew, done, info = env.step(env.action_space.sample())
        print(rew, done, info)
        env.render()
        if done:
            print('episode complete')
            obs = env.reset()
def main():
    env = make_env()
    max_screen_x = 0
    model = DQN.load(saved_model_file_path)
    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        if info['screen_x'] > max_screen_x:
            max_screen_x = info['screen_x']
            logger.info("Max screen x: " + str(max_screen_x))
        if done:
            env.reset()
        else:
            env.render()
def main():
    print('connecting to remote environment')
    env = make_env(stack=False)
    print('starting episode')
    env.reset()
    episode_step = 0
    episode_reward = 0
    while True:
        episode_step += 1
        #action = env.action_space.sample()
        # Hand-scripted action sequence for HilltopZone.Act1
        if episode_step < 52:
            action = 1
        elif episode_step < 63:
            action = 0
        elif episode_step < 85:
            action = episode_step % 2
        elif episode_step < 95:
            action = 1
        elif episode_step < 155:
            action = 1
        elif episode_step < 160:
            action = 5
        else:
            if episode_step % 2 == 0:
                action = 1
            else:
                action = 5
        obs, rew, done, info = env.step(action)
        episode_reward += rew
        print(action)
        print(rew, done, info)
        print(episode_reward)
        env.render()
        if done:
            print('episode complete')
            obs = env.reset()
            episode_step = 0
            episode_reward = 0
def main():
    """Run DQN until the environment throws an exception."""
    env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(sess, "/root/compo/model.ckpt")
        #print('model restored')
        replay_buffer = pickle.load(gzip.open('/root/compo/replay_buffer.p.gz', 'rb'))
        replay_buffer.alpha = 0.2
        replay_buffer.beta = 0.4
        replay_buffer.capacity = 100000
        restore_ppo2_weights(sess)
        dqn.train(num_steps=2000000,  # Make sure an exception arrives before we stop.
                  player=player,
                  replay_buffer=replay_buffer,  # PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1)
                  optimize_op=optimize,
                  train_interval=4,
                  target_interval=8192,
                  batch_size=32,
                  min_buffer_size=20000)
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

from sonic_util import make_env
from agent import Agent
from collections import deque
import numpy as np

level_name = 'LabyrinthZone.Act1'
env = make_env(level_name=level_name,
               stack=False, scale_rew=True)
env.seed(714)

state_space = list(env.observation_space.shape)
action_space = env.action_space.n
print('State shape: ', state_space)
print('Number of actions: ', action_space)

BATCH_SIZE = 3000
EXPERIENCE_REPLAY = False
BUFFER_SIZE = int(9100)

agent = Agent(state_space, action_space, level_name=level_name,
              param={'EXPERIENCE_REPLAY': EXPERIENCE_REPLAY,
                     'BUFFER_SIZE': BUFFER_SIZE,
                     'BATCH_SIZE': BATCH_SIZE})


def add_noise(state):
    row, col, ch = state.shape
    # Additive Gaussian noise over the observation image
    # (assumed completion; the source snippet is truncated here).
    gauss = np.random.normal(0.0, 0.1 ** 0.5, (row, col, ch))
    return state + gauss
def train(
    train_id,
    game,
    level,
    num_processes,
    num_timesteps,
    algo_name,
    policy_name,
    is_joint,
    model_save_path,
    logs_path,
    hyper_opt,
    load_model_path=None,
    train_counter=0,  # To be set (incrementally) when running multiple trainings
    short_life=False,
    backtracking=False,
):
    global global_logs_path, best_mean_reward, n_steps

    print("\n\nStarting training with args:\n")
    print(log_fun_args(locals()))
    print("\n")

    global_logs_path = logs_path
    best_mean_reward, n_steps = -np.inf, 0

    envs = []
    if is_joint:
        envs = [
            make_env(
                game=game,
                level=level,
                rank=i,
                log_dir=logs_path,
                seed=train_counter * 100,
                short_life=short_life,
                backtracking=backtracking,
            )
            for i, (game, level) in enumerate(small_train_set)
        ]
    else:
        envs = [
            make_env(
                game=game,
                level=level,
                rank=i,
                log_dir=logs_path,
                seed=train_counter * 100,
                short_life=short_life,
                backtracking=backtracking,
            )
            for i in range(num_processes)
        ]

    if num_processes == 1:
        env = VecFrameStack(DummyVecEnv(envs), 4)
    else:
        env = VecFrameStack(SubprocVecEnv(envs), 4)
    print("\n\n")

    algo = None
    if algo_name == "ppo2":
        algo = PPO2
    elif algo_name == "a2c":
        algo = A2C

    policy = None
    nminibatches = 4
    if policy_name == "cnn":
        policy = CnnPolicy
    elif policy_name == "cnnlstm":
        if is_joint:
            nminibatches = 5
        policy = CnnLstmPolicy

    model = None
    if load_model_path:
        print("Loading a model...")
        model = algo.load(load_model_path, env=env, tensorboard_log=logs_path)
    else:
        print("Creating a new model...")
        if algo_name == "ppo2":
            if hyper_opt:
                model = algo(
                    policy,
                    env,
                    verbose=1,
                    tensorboard_log=logs_path,
                    n_steps=4096,
                    nminibatches=8,
                    learning_rate=2e-4,
                    ent_coef=0.01,
                )
            else:
                model = PPO2(
                    policy,
                    env,
                    nminibatches=nminibatches,
                    verbose=1,
                    tensorboard_log=logs_path,
                )
        elif algo_name == "a2c":
            model = A2C(policy, env, verbose=1, tensorboard_log=logs_path)

    print(f"Starting training for {num_timesteps} timesteps")
    model.learn(total_timesteps=num_timesteps, callback=callback, log_interval=1)
    print("Training finished!")

    if model_save_path:
        model.save(model_save_path)
        print("Model saved in:\t", model_save_path)

    timestep_values, score_values = ts2xy(load_results(logs_path), "timesteps")
    score_values = score_values * 100
    plot_path = os.path.join(logs_path, f"{level}.png")
    print("Saving the plot in: " + plot_path)
    save_plot(timestep_values, score_values, title=level, save_path=plot_path)
    env.close()
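# `callback` above is not shown in this snippet. It presumably follows the
# classic stable-baselines v2 pattern of checking the Monitor logs and keeping
# the best model so far, using the best_mean_reward/n_steps globals this
# function resets. A minimal sketch under that assumption (the "best_model"
# filename is hypothetical):
def callback(_locals, _globals):
    global best_mean_reward, n_steps
    n_steps += 1
    if n_steps % 1000 == 0:
        x, y = ts2xy(load_results(global_logs_path), "timesteps")
        if len(x) > 0:
            mean_reward = np.mean(y[-100:])
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                # _locals["self"] is the model being trained.
                _locals["self"].save(os.path.join(global_logs_path, "best_model"))
    return True  # returning False would stop training early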
def build_env(level_name):
    env = make_env(stack=False, scale_rew=True, level_name=level_name)
    return env
def main():
    """Run DQN until the environment throws an exception."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--restore', '-restore', action='store_true',
                        help='restore from checkpoint file')
    parser.add_argument('--record', '-record', action='store_true',
                        help='record bk2 movies')
    args = parser.parse_args()

    env = AllowBacktracking(
        make_env(stack=False, scale_rew=False, record=args.record))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)

    checkpoint_dir = os.path.join(os.getcwd(), 'results')
    results_dir = os.path.join(os.getcwd(), 'results',
                               time.strftime("%d-%m-%Y_%H-%M-%S"))
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)
    summary_writer = tf.summary.FileWriter(results_dir)

    # TODO
    # env = wrappers.Monitor(env, results_dir, force=True)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())

        # Restore after running the initializer; otherwise the initializer
        # would overwrite the restored weights.
        saver = tf.train.Saver()
        if args.restore:
            latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
            if latest_checkpoint:
                print("Loading model checkpoint {} ...\n".format(latest_checkpoint))
                saver.restore(sess, latest_checkpoint)
            else:
                print("Checkpoint not found")

        reward_hist = []
        total_steps = 0

        # runs with every completed episode
        def _handle_ep(steps, rew):
            nonlocal total_steps
            total_steps += steps
            reward_hist.append(rew)

            summary_reward = tf.Summary()
            summary_reward.value.add(tag='global/reward', simple_value=rew)
            summary_writer.add_summary(summary_reward, global_step=total_steps)

            print('save model')
            saver.save(sess=sess, save_path=checkpoint_dir + '/model',
                       global_step=total_steps)

            if len(reward_hist) == REWARD_HISTORY:
                print('%d steps: mean=%f' %
                      (total_steps, sum(reward_hist) / len(reward_hist)))
                summary_meanreward = tf.Summary()
                summary_meanreward.value.add(tag='global/mean_reward',
                                             simple_value=sum(reward_hist) / len(reward_hist))
                summary_writer.add_summary(summary_meanreward,
                                           global_step=total_steps)
                reward_hist.clear()

        dqn.train(num_steps=7000000,  # Make sure an exception arrives before we stop.
                  player=player,
                  replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1),
                  optimize_op=optimize,
                  train_interval=1,
                  target_interval=8192,
                  batch_size=32,
                  min_buffer_size=20000,
                  handle_ep=_handle_ep)
def main():
    """Run DQN until the environment throws an exception."""
    print('creating env')
    env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    print('starting tf session')
    with tf.Session(config=config) as sess:
        print('creating agent')
        online_net, target_net = rainbow_models(sess,
                                                env.action_space.n,
                                                gym_space_vectorizer(env.observation_space),
                                                min_val=-200,
                                                max_val=200)
        dqn = DQN(online_net, target_net)
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())
        train_steps = 5000
        print('training steps:', train_steps)
        for j in range(1):
            print(j)
            start = time.time()
            dqn.train(num_steps=train_steps,  # Make sure an exception arrives before we stop.
                      player=player,
                      replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1),
                      optimize_op=optimize,
                      train_interval=1,
                      target_interval=8192,
                      batch_size=32,
                      min_buffer_size=10000)
            end = time.time()
            print(end - start)
        print('done training')
        print('save nn')
        save_path = saver.save(sess, "saved_models/rainbow5.ckpt")
        print("Model saved in path: %s" % save_path)
        tvars = tf.trainable_variables()
        tvars_vals = sess.run(tvars)
        #for var, val in zip(tvars, tvars_vals):
        #    print(var.name, val[0])
        #print(tvars_vals[0][-5:])
        #print('stepping')
        #obs = env.reset()
        #online_net.step(obs, obs)
model.learn(total_timesteps=TOTAL_TIMESTEPS, callback=callback)
model.save(saved_model_name)
obs = env.reset()


if __name__ == '__main__':
    # Setup logging
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    logger = logging.getLogger(__name__)
    env = make_env()
    model = None
    print(DECORATOR)
    if len(argv) == 1:
        if isfile(saved_model_name):
            logging.info("Loading model from file: " + saved_model_name)
            model = DQN.load(saved_model_name,
                             env=env,
                             verbose=0,
                             tensorboard_log=TENSORBOARD_LOG_DIR,
                             buffer_size=REPLAY_BUFFER_SIZE)
        else:
            logging.info("Creating model from scratch...")
            model = DQN(CnnPolicy, env,
# Hyper Parameters
OUTPUT_GRAPH = True
MAX_EPISODE = 10000
DISPLAY_REWARD_THRESHOLD = 3000  # render environment if total episode reward exceeds this threshold
MAX_EP_STEPS = 4500              # maximum time steps in one episode
RENDER = False                   # rendering wastes time
GAMMA = 0.99                     # reward discount in TD error
LR_A = 1e-9                      # learning rate for actor
LR_C = 1e-6                      # learning rate for critic
BUFFER_SIZE = 5000
BATCH_SIZE = 32
UPDATE_EVERY = 100

# gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=1)
env = make_env(stack=False, scale_rew=True)
# env.seed(1)  # reproducible
# env = env.unwrapped

state_space = list(env.observation_space.shape)
action_space = env.action_space.n
print('State shape: ', state_space)
print('Number of actions: ', [1, action_space])


def reshape_state(s):
    # Add a leading batch dimension.
    s = s[np.newaxis, :]
    return s
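# GAMMA above is the discount in the one-step TD error that drives both the
# actor and the critic updates. A minimal sketch of that quantity only; the
# real networks live in this project's actor/critic classes:
def one_step_td_error(reward, v_s, v_s_next, done, gamma=GAMMA):
    # delta = r + gamma * V(s') - V(s); the bootstrap term is dropped at episode end.
    target = reward + (0.0 if done else gamma * v_s_next)
    return target - v_s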
import tensorflow as tf

from anyrl.algos import DQN
from anyrl.envs import BatchedGymEnv
from anyrl.envs.wrappers import BatchedFrameStack
from anyrl.models import rainbow_models
from anyrl.rollouts import BatchedPlayer, PrioritizedReplayBuffer, NStepPlayer
from anyrl.spaces import gym_space_vectorizer, StackedBoxSpace
import gym_remote.exceptions as gre
import numpy as np

from sonic_util import AllowBacktracking, make_env

print('creating env')
#z = StackedBoxSpace(np.zeros((84,84,1)), 4)
env = AllowBacktracking(make_env(stack=False, scale_rew=False))
env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
#print(env.action_space.n)  # StackedBox(84,84,1)
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
print('starting tf session')
with tf.Session(config=config) as sess:
    print('creating agent')
def build_env():
    #env, multi_action = make(game='SonicTheHedgehog-Genesis', state='LabyrinthZone.Act1'), True
    env, multi_action = make_env(stack=False, scale_rew=False), False
    return env, multi_action
max_steps = 500
store_model = False
RECORD_DIR = False
BATCH_SIZE = 8

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"setting device to '{device}'")

# close current environment if there is one (e.g. on failure to complete last time)
try:
    env.close()
except NameError:
    pass

# create the environment
# Loading the level
env = make_env(GAME_NAME, LEVEL, save_game=RECORD_DIR)

# Get screen size so that we can initialize layers correctly based on shape
# returned from AI gym. Typical dimensions at this point are close to 3x40x90,
# which is the result of a clamped and down-scaled render buffer in get_screen()
init_screen = get_screen()
_, screen_depth, screen_height, screen_width = init_screen.shape
print(f"discovered input image ({screen_depth},{screen_height},{screen_width})")

# Get number of actions from gym action space
n_actions = env.action_space.n

policy_net = FDQN(screen_depth, screen_height, screen_width, n_actions).to(device)
target_net = FDQN(screen_depth, screen_height, screen_width,
# import retro
from retro_contest.local import make
from sonic_util import make_env
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
from tool import preprocess

# Import environment and get env info
# env = retro.make(game='SonicTheHedgehog-Genesis', state='GreenHillZone.Act1', record=False)
# env, multi_action = make(game='SonicTheHedgehog-Genesis', state='LabyrinthZone.Act1'), True
env, multi_action = make_env(stack=False, scale_rew=False), False
env.seed(1)

state_space = list(env.observation_space.shape)
action_space = env.action_space.n
print('State shape: ', state_space)
print('Number of actions: ', [1, action_space])

BUFFER_SIZE = int(5e3)  # replay buffer size
BATCH_SIZE = 16         # minibatch size
GAMMA = 0.99            # discount factor
TAU = 1e-3              # for soft update of target parameters
LR = 1e-6               # learning rate
UPDATE_EVERY = 500      # how often to update the network

from dqn_agent import Agent

agent = Agent(state_size=state_space, action_size=action_space,
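# TAU above drives the standard soft target update,
# theta_target <- tau * theta_local + (1 - tau) * theta_target.
# A minimal PyTorch sketch of that rule; the actual update presumably lives
# inside dqn_agent.Agent:
def soft_update(local_model, target_model, tau=1e-3):
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(tau * local_param.data +
                                (1.0 - tau) * target_param.data)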
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy, CnnLnLstmPolicy, CnnPolicy, CnnLstmPolicy
from stable_baselines.common.vec_env import SubprocVecEnv, DummyVecEnv
from stable_baselines import PPO2, A2C
from sonic_util import make_env
from gym.wrappers import Monitor

env = DummyVecEnv([lambda: make_env(level_name='LabyrinthZone.Act1',
                                    stack=False, scale_rew=True)])
modelname = 'sonicppo'
# PPO2.load is a classmethod that returns the loaded model; calling .load on
# a freshly constructed instance would discard the loaded weights.
model = PPO2.load("./checkpoint" + modelname, env=env)

obs = env.reset()
done = False
reward = 0
while not done:
    actions, _ = model.predict(obs)
    obs, rew, done, info = env.step(actions)
    reward += rew
    env.render()
env.close()
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy, CnnLnLstmPolicy, CnnPolicy, CnnLstmPolicy
from stable_baselines.common.vec_env import SubprocVecEnv, DummyVecEnv
from stable_baselines import PPO2, A2C
from sonic_util import make_env
from gym.wrappers import Monitor

env = DummyVecEnv([lambda: make_env(level_name='GreenHillZone.Act1',
                                    stack=False, scale_rew=True)])
modelname = 'sonicppo'
model = PPO2(CnnPolicy, env, n_steps=3500, verbose=1)
model.learn(total_timesteps=1000000)
model.save("./checkpoint" + modelname)