def play_level(project_name, level_name, episodes=1, weighted_random=False):
    """Load a trained actor and replay it on the given level.

    Args:
        project_name (str): Project directory (under learning_movies/) holding the saved actor
        level_name (str): Name of the level (save state) to load
        episodes (int): Number of episodes to play
        weighted_random (bool): Sample actions from the policy distribution instead of taking the argmax
    """
    num_actions = 17

    # Placeholder inputs needed to rebuild the custom PPO loss when loading the saved model
    advantage = Input(shape=(1, ), name="actor_advantage")
    old_prediction = Input(shape=(num_actions, ), name="actor_previous_prediction")

    env = retro.make(
        "SuperMarioWorld-Snes",
        info="variables/data.json",
        scenario="scenarios/scenario.json",
        obs_type=retro.Observations(0))
    env.load_state(level_name)
    env = MarioDiscretizer(env)

    model_path = "learning_movies/" + project_name + "/"
    actor = load_model(
        model_path + "actor_model.hdf5",
        custom_objects={
            'loss': proximal_policy_optimization_loss(
                advantage=advantage, old_prediction=old_prediction)
        })

    for _ in range(episodes):
        done = False
        obs = env.reset()
        action_list = []
        while not done:
            p = actor.predict(obs.reshape((1, ) + env.observation_space.shape))
            if weighted_random:
                # Sample an action weighted by the policy's probabilities
                action = np.random.choice(num_actions, p=np.nan_to_num(p[0]))
            else:
                # Always take the most probable action
                action = np.argmax(p)
            action_list.append(action)
            obs, _, done, _ = env.step(action)

        # Replay the recorded actions with rendering so the episode can be watched
        _ = env.reset()
        for act in action_list:
            env.render(mode="human")
            _, _, _, _ = env.step(act)
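# play_level reloads the saved actor through a custom_objects entry for
# proximal_policy_optimization_loss, which is defined elsewhere in this repo. For reference,
# below is a minimal sketch of a Keras closure implementing PPO's clipped surrogate objective
# with an entropy bonus; the clipping range and entropy coefficient mirror the LOSS_CLIPPING
# and ENTROPY_LOSS hyperparameters set in the agent constructors further down, but the exact
# body of the repo's loss function is an assumption.
import keras.backend as K


def ppo_clipped_loss_sketch(advantage, old_prediction, loss_clipping=0.2, entropy_loss=1e-3):
    """Hypothetical stand-in for proximal_policy_optimization_loss (assumed implementation)."""
    def loss(y_true, y_pred):
        # y_true is the one-hot action actually taken; y_pred is the current policy output
        prob = y_true * y_pred
        old_prob = y_true * old_prediction
        ratio = prob / (old_prob + 1e-10)
        clipped = K.clip(ratio, min_value=1 - loss_clipping, max_value=1 + loss_clipping)
        # Clipped surrogate objective from the PPO paper, plus an entropy bonus on the policy
        entropy = -(prob * K.log(prob + 1e-10))
        return -K.mean(K.minimum(ratio * advantage, clipped * advantage) + entropy_loss * entropy)
    return loss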
def __init__(self,
             project_name=None,
             game='SuperMarioWorld-Snes',
             state=None,
             scenario=None,
             observation_type=None,
             record=None,
             variables=None,
             *args,
             **kwargs):
    """
    Args:
        project_name (str): Name of the project and of the directory recordings are saved in
        game (str): Name of the ROM to load; defaults to Super Mario World for the SNES
        state (str): Name of the level (or save state) to load for training
        scenario (str): Path to a JSON file describing the environment and its fitness parameters
        observation_type (int or bool): 0 for screen observations, 1 for memory-state observations (2D vs. 1D)
        record (str): Path to a directory within the project directory to save recordings to
        variables (str): Path to a JSON file with the memory mapping of in-game variables
    """
    self.project_name = project_name
    self.game = game
    self.state = state
    self.scenario = scenario
    self.variables = variables
    if isinstance(observation_type, bool):
        observation_type = int(observation_type)
    self.observation_type = retro.Observations(observation_type)
    self.record_path = self._fix_record_path(record)
def __init__(self, project_name, game, scenario, variables, observation_type, record):
    self.project_name = project_name
    self.game = game
    self.scenario = scenario
    self.variables = variables
    self.observation_type = retro.Observations(observation_type)
    self.record_path = self._fix_record_path(record)
    self.isVectorized = False
    self.max_episode_steps = 5000
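# Both constructors above call self._fix_record_path(record), whose body is not shown in this
# excerpt. A plausible minimal sketch of an equivalent helper follows, assuming it only resolves
# the recording directory under learning_movies/<project_name>/ (the same root play_level uses)
# and creates it if missing; the name fix_record_path and this behavior are assumptions.
import os


def fix_record_path(project_name, record=None):
    """Hypothetical helper mirroring what _fix_record_path likely does (assumed behavior)."""
    base = os.path.join("learning_movies", project_name)
    record_path = os.path.join(base, record) if record else base
    os.makedirs(record_path, exist_ok=True)  # make sure the directory exists before recording into it
    return record_path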
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)

    self.IS_COLOR = True
    self.observation_type = retro.Observations(0)  # Must be 0 for image observations

    self.env = self.make_env()
    # Wraps the env to randomly (stickprob) skip frames (n), cutting down on training time
    self.env = StochasticFrameSkip(self.env, n=4, stickprob=0.5)
    # The following wrappers cut down on training time even further, if desired
    # self.env = Downsample(self.env, ratio=2)  # Divides each side of the image by 2, cutting total pixels by 4x
    # self.env, self.IS_COLOR = Rgb2gray(self.env), False

    self.episode = 0
    self.observation = self.env.reset()
    self.reward = []
    self.reward_over_time = {}
    self.actor_critic_losses = [{}, {}]

    self.MAX_EPISODES = 100  # Number of episodes to train over
    self.LOSS_CLIPPING = 0.2  # Only the clipped surrogate loss is implemented; the paper reported it works best
    self.EPOCHS = 10  # Number of epochs to optimize on between episodes
    self.ACTIVATION = "tanh"  # Activation function used in the actor/critic networks
    self.GAMMA = 0.85  # Used in reward scaling; a value of 0.99 scales rewards down by 1% per step (try 0.01 here)
    self.BUFFER_SIZE = 64  # Number of actions to collect for each analysis (training update)
    self.BATCH_SIZE = 8  # Batch size when fitting the networks; a smaller batch size means more weight updates.
    # Batch size should be both < BUFFER_SIZE and a factor of BUFFER_SIZE
    self.NUM_ACTIONS = 17  # Total number of actions in the action space
    if self.IS_COLOR:
        self.NUM_STATE = (224, 256, 3)  # Input image shape (RGB)
    else:
        self.NUM_STATE = (224, 256, 1)  # Input image shape (grayscale)
    self.NUM_FILTERS = 8  # Base number of filters for the convolutional layers in the actor/critic networks
    self.HIDDEN_SIZE = 8  # Number of neurons in the final dense layers of the actor/critic networks
    self.NUM_LAYERS = 2  # Number of convolutional layers in the actor and critic networks
    self.ENTROPY_LOSS = 1e-3  # Entropy coefficient in the loss function; helps the loss scale properly
    self.LEARNING_RATE = 1e-4  # A lower learning rate stabilises training greatly

    # These are used as action/prediction placeholders at predict time
    self.DUMMY_ACTION = np.zeros((1, self.NUM_ACTIONS))
    self.DUMMY_VALUE = np.zeros((1, 1))

    self.critic = self.build_critic()
    self.actor = self.build_actor()
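# build_actor and build_critic are defined elsewhere in the class. As a rough illustration of
# how the hyperparameters above fit together, here is a minimal sketch of a convolutional PPO
# actor; the layer arrangement, the helper name build_conv_actor, and the three-input wiring
# (advantage and old prediction fed as extra inputs so the custom loss can see them, which is
# what DUMMY_VALUE / DUMMY_ACTION stand in for at predict time) are assumptions, not the repo's
# exact architecture.
from keras.models import Model
from keras.layers import Input, Conv2D, Flatten, Dense
from keras.optimizers import Adam


def build_conv_actor(num_state=(224, 256, 3), num_actions=17, num_filters=8,
                     hidden_size=8, num_layers=2, activation="tanh", learning_rate=1e-4):
    """Sketch of a convolutional policy network wired to a PPO clipped loss (assumed design)."""
    state_input = Input(shape=num_state)
    advantage = Input(shape=(1,))
    old_prediction = Input(shape=(num_actions,))

    x = state_input
    for i in range(num_layers):
        # Double the filter count each layer, starting from num_filters
        x = Conv2D(num_filters * (2 ** i), kernel_size=3, strides=2,
                   activation=activation, padding="same")(x)
    x = Flatten()(x)
    x = Dense(hidden_size, activation=activation)(x)
    out_actions = Dense(num_actions, activation="softmax", name="output")(x)

    model = Model(inputs=[state_input, advantage, old_prediction], outputs=[out_actions])
    # Reuses the loss closure sketched above (or the repo's own proximal_policy_optimization_loss)
    model.compile(optimizer=Adam(lr=learning_rate),
                  loss=[ppo_clipped_loss_sketch(advantage=advantage,
                                                old_prediction=old_prediction)])
    return model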
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)

    self.observation_type = retro.Observations(1)  # Must be 1 for numerical (memory-state) observations

    self.env = self.make_env()
    # Wraps the env to randomly (stickprob) skip frames (n), cutting down on training time
    self.env = StochasticFrameSkip(self.env, n=4, stickprob=0.5)

    self.episode = 0
    self.observation = self.env.reset()
    self.reward = []
    self.reward_over_time = {}
    self.actor_critic_losses = [{}, {}]
    self.gradient_steps = 0

    self.MAX_EPISODES = 10  # Number of episodes to train over
    self.LOSS_CLIPPING = 0.2  # Only the clipped surrogate loss is implemented; the paper reported it works best
    self.EPOCHS = 10  # Number of epochs to optimize on between episodes
    self.ACTIVATION = "tanh"  # Activation function used in the actor/critic networks
    self.GAMMA = 0.85  # Used in reward scaling; a value of 0.99 scales rewards down by 1% per step
    self.BUFFER_SIZE = 4096  # Number of actions to collect for each analysis (training update)
    self.BATCH_SIZE = 64  # Batch size when fitting the networks; a smaller batch size means more weight updates.
    # Batch size should be both < BUFFER_SIZE and a factor of BUFFER_SIZE
    self.NUM_ACTIONS = 17  # Total number of actions in the action space
    self.NUM_STATE = 141312  # Total number of inputs from the environment (i.e. the observation space)
    self.HIDDEN_SIZE = 24  # Number of neurons in each actor/critic network layer
    self.NUM_LAYERS = 1  # Number of layers in the actor and critic networks
    self.ENTROPY_LOSS = 1e-3  # Entropy coefficient in the loss function; helps the loss scale properly
    self.LEARNING_RATE = 1e-4  # A lower learning rate stabilises training greatly

    # These are used as action/prediction placeholders at predict time
    self.DUMMY_ACTION = np.zeros((1, self.NUM_ACTIONS))
    self.DUMMY_VALUE = np.zeros((1, 1))

    self.critic = self.build_critic()
    self.actor = self.build_actor()
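# As a rough illustration (assumed, not the repo's exact code) of how the DUMMY_VALUE /
# DUMMY_ACTION placeholders and GAMMA are typically used during a rollout with a three-input
# PPO actor like the sketch above; get_action and transform_reward are hypothetical names.
import numpy as np


def get_action(actor, observation, dummy_value, dummy_action, num_actions=17):
    """Sample an action from the current policy; the dummy inputs only satisfy the loss inputs."""
    p = actor.predict([observation.reshape((1,) + observation.shape), dummy_value, dummy_action])
    action = np.random.choice(num_actions, p=np.nan_to_num(p[0]))
    action_matrix = np.zeros(num_actions)
    action_matrix[action] = 1  # One-hot action, later fed to the loss as y_true
    return action, action_matrix, p


def transform_reward(reward, gamma=0.85):
    """Propagate rewards backwards, discounting by gamma at each step (as the GAMMA comment describes)."""
    for j in range(len(reward) - 2, -1, -1):
        reward[j] += reward[j + 1] * gamma
    return reward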