def play_level(project_name, level_name, episodes=1, weighted_random=False):
    """Replay a trained actor on a Super Mario World level.

    Loads the saved actor network for ``project_name``, plays ``episodes``
    episodes of ``level_name`` with it, then re-renders each episode from the
    recorded action list. If ``weighted_random`` is True, actions are sampled
    from the policy distribution instead of being chosen greedily.
    """
    num_actions = 17  # Size of the discretized action space
    # Placeholder inputs needed to rebuild the custom PPO loss when loading the actor
    advantage = Input(shape=(1, ), name="actor_advantage")
    old_prediction = Input(shape=(num_actions, ),
                           name="actor_previous_prediction")
    env = retro.make("SuperMarioWorld-Snes",
                     info="variables/data.json",
                     scenario="scenarios/scenario.json",
                     obs_type=retro.Observations(0))  # 0 = image (screen) observations
    env.load_state(level_name)
    env = MarioDiscretizer(env)
    model_path = "learning_movies/" + project_name + "/"
    actor = load_model(model_path + "actor_model.hdf5",
                       custom_objects={
                           'loss':
                           proximal_policy_optimization_loss(
                               advantage=advantage,
                               old_prediction=old_prediction)
                       })
    for _ in range(episodes):
        done = False
        obs = env.reset()
        action_list = []
        while not done:
            p = actor.predict(obs.reshape((1, ) + env.observation_space.shape))
            if weighted_random:
                # Sample an action from the policy's output distribution
                action = np.random.choice(num_actions, p=np.nan_to_num(p[0]))
            else:
                # Greedily take the highest-probability action
                action = np.argmax(p)
            action_list.append(action)
            obs, _, done, _ = env.step(action)
        # Replay the recorded episode with on-screen rendering
        env.reset()
        for act in action_list:
            env.render(mode="human")
            env.step(act)
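A quick usage sketch; the state name below is an assumption for illustration, not taken from the repository:

# Replay three episodes with a trained actor, sampling actions from the
# policy distribution rather than always taking the argmax.
play_level("my_smw_project", "YoshiIsland2", episodes=3, weighted_random=True)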
Example #2
    def __init__(self,
                 project_name=None,
                 game='SuperMarioWorld-Snes',
                 state=None,
                 scenario=None,
                 observation_type=None,
                 record=None,
                 variables=None,
                 *args,
                 **kwargs):
        """
        Args:
            project_name (str): Name of the project and of the directory to save recordings in
            game (str): Name of the ROM to load, defaults to SMW for the SNES
            state (str): Name of the level (or state) to load for training
            scenario (str): Path to a json file describing the environment and the fitness parameters
            observation_type (int or bool): 0 or 1; 0 for screen (image) observations, 1 for memory/RAM observations
            record (str): Path to a specific directory within the project dir to save recordings to
            variables (str): Path to a json file with the memory mapping of in-game variables
        """
        self.project_name = project_name
        self.game = game
        self.state = state
        self.scenario = scenario
        self.variables = variables
        if isinstance(observation_type, bool):
            observation_type = int(observation_type)
        self.observation_type = retro.Observations(observation_type)
        self.record_path = self._fix_record_path(record)
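For reference, retro.Observations is the gym-retro enum behind the observation_type handling above: 0 selects raw screen (image) observations and 1 selects the flat RAM observation, which is why a bool is coerced to an int before the conversion.

import retro

# IMAGE == 0 (raw screen pixels), RAM == 1 (flat memory dump)
assert retro.Observations(0) is retro.Observations.IMAGE
assert retro.Observations(int(True)) is retro.Observations.RAM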
Example #3
    def __init__(self, project_name, game, scenario, variables,
                 observation_type, record):
        self.project_name = project_name
        self.game = game
        self.scenario = scenario
        self.variables = variables
        self.observation_type = retro.Observations(observation_type)
        self.record_path = self._fix_record_path(record)
        self.isVectorized = False
        self.max_episode_steps = 5000
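Both constructors delegate recording-directory handling to _fix_record_path, whose implementation is not shown on this page. A minimal sketch of what such a helper plausibly does, assuming recordings live under the learning_movies/<project_name>/ layout used in play_level above (an assumption, not the repo's code):

    def _fix_record_path(self, record):
        # Hypothetical sketch: default to the project directory, optionally
        # nest the given sub-directory under it, and make sure it exists.
        import os  # imported here only to keep the sketch self-contained
        base = os.path.join("learning_movies", self.project_name)
        path = base if record is None else os.path.join(base, record)
        os.makedirs(path, exist_ok=True)
        return path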
    def __init__(self, *args, **kwargs):

        super().__init__(*args, **kwargs)
        self.IS_COLOR = True
        self.observation_type = retro.Observations(
            0)  # Must be 0 for image observation
        self.env = self.make_env()
        self.env = StochasticFrameSkip(
            self.env, n=4, stickprob=0.5
        )  # Wraps env to randomly (stickprob) skip frames (n), cutting down on training time
        # The following wrappers are here to cut down on training time even further, if desired
        # self.env = Downsample(self.env, ratio=2)  # Divides each side of image by 2, thus cutting down total pixels by 4x
        # self.env, self.IS_COLOR = Rgb2gray(self.env), False

        self.episode = 0
        self.observation = self.env.reset()
        self.reward = []
        self.reward_over_time = {}
        self.actor_critic_losses = [{}, {}]

        self.MAX_EPISODES = 100  # Number of episodes to train over
        self.LOSS_CLIPPING = 0.2  # Epsilon for the clipped surrogate objective; only the clipped PPO variant is implemented, which the paper found works best (see the loss sketch after this example)
        self.EPOCHS = 10  # Number of Epochs to optimize on between episodes
        self.ACTIVATION = "tanh"  # Activation function to use in the actor/critic networks
        self.GAMMA = 0.85  # Discount factor used in reward scaling; e.g. 0.99 scales each future step's reward down by 1% (try 0.01 on this)
        self.BUFFER_SIZE = 64  # Number of transitions collected before each optimization pass
        self.BATCH_SIZE = 8  # Batch size when fitting network. Smaller batch size = more weight updates.
        # Batch size should be both < BUFFER_SIZE and a factor of BUFFER_SIZE
        self.NUM_ACTIONS = 17  # Total number of actions in the action space
        if self.IS_COLOR:
            self.NUM_STATE = (224, 256, 3)  # Input image shape (height, width, RGB channels)
        else:
            self.NUM_STATE = (224, 256, 1)  # Single channel when the image has been converted to grayscale
        self.NUM_FILTERS = 8  # Preliminary number of filters for the layers in agent/critic networks
        self.HIDDEN_SIZE = 8  # Number of neurons in actor/critic network final dense layers
        self.NUM_LAYERS = 2  # Number of convolutional layers in the agent and critic networks
        self.ENTROPY_LOSS = 1e-3  # Entropy bonus coefficient in the loss function; helps the loss scale properly
        self.LEARNING_RATE = 1e-4  # Lower lr stabilises training greatly

        # These are used as action/prediction placeholders
        self.DUMMY_ACTION = np.zeros((1, self.NUM_ACTIONS))
        self.DUMMY_VALUE = np.zeros((1, 1))

        self.critic = self.build_critic()
        self.actor = self.build_actor()
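Example #1 loads the actor with a custom 'loss' built by proximal_policy_optimization_loss(advantage, old_prediction), and LOSS_CLIPPING / ENTROPY_LOSS above are the knobs of that objective. The repository's exact implementation is not reproduced on this page; a minimal sketch of a clipped-surrogate PPO loss of that shape (an assumption about the details, not the author's verbatim code):

import keras.backend as K

LOSS_CLIPPING = 0.2  # mirrors self.LOSS_CLIPPING above
ENTROPY_LOSS = 1e-3  # mirrors self.ENTROPY_LOSS above

def proximal_policy_optimization_loss(advantage, old_prediction):
    # Returns a Keras loss closed over the extra advantage/old_prediction
    # inputs, matching the custom_objects usage in play_level().
    def loss(y_true, y_pred):
        # y_true: one-hot encoding of the action taken; y_pred: current policy output
        prob = K.sum(y_true * y_pred, axis=-1, keepdims=True)
        old_prob = K.sum(y_true * old_prediction, axis=-1, keepdims=True)
        ratio = prob / (old_prob + 1e-10)
        surr1 = ratio * advantage
        surr2 = K.clip(ratio, 1 - LOSS_CLIPPING, 1 + LOSS_CLIPPING) * advantage
        entropy = -K.sum(y_pred * K.log(y_pred + 1e-10), axis=-1, keepdims=True)
        return -K.mean(K.minimum(surr1, surr2) + ENTROPY_LOSS * entropy)
    return loss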
    def __init__(self, *args, **kwargs):

        super().__init__(*args, **kwargs)

        self.observation_type = retro.Observations(
            1)  # Must be 1 for numerical observation
        self.env = self.make_env()
        self.env = StochasticFrameSkip(
            self.env, n=4, stickprob=0.5
        )  # Wraps env to randomly (stickprob) skip frames (n), cutting down on training time

        self.episode = 0
        self.observation = self.env.reset()
        self.reward = []
        self.reward_over_time = {}
        self.actor_critic_losses = [{}, {}]
        self.gradient_steps = 0

        self.MAX_EPISODES = 10  # Number of episodes to train over
        self.LOSS_CLIPPING = 0.2  # Epsilon for the clipped surrogate objective; only the clipped PPO variant is implemented, which the paper found works best
        self.EPOCHS = 10  # Number of Epochs to optimize on between episodes
        self.ACTIVATION = "tanh"  # Activation function to use in the actor/critic networks
        self.GAMMA = 0.85  # Discount factor used in reward scaling; e.g. 0.99 scales each future step's reward down by 1% (see the discounting sketch after this example)
        self.BUFFER_SIZE = 4096  # Number of transitions collected before each optimization pass
        self.BATCH_SIZE = 64  # Batch size when fitting network. Smaller batch size = more weight updates.
        # Batch size should be both < BUFFER_SIZE and a factor of BUFFER_SIZE
        self.NUM_ACTIONS = 17  # Total number of actions in the action space
        self.NUM_STATE = 141312  # Total number of inputs from the environment (i.e. the observation space)
        self.HIDDEN_SIZE = 24  # Number of neurons in actor/critic network layers
        self.NUM_LAYERS = 1  # Number of layers in the actor and critic networks
        self.ENTROPY_LOSS = 1e-3  # Entropy bonus coefficient in the loss function; helps the loss scale properly
        self.LEARNING_RATE = 1e-4  # Lower lr stabilises training greatly

        # These are used as action/prediction placeholders
        self.DUMMY_ACTION = np.zeros((1, self.NUM_ACTIONS))
        self.DUMMY_VALUE = np.zeros((1, 1))

        self.critic = self.build_critic()
        self.actor = self.build_actor()
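The GAMMA comments above refer to discounting collected rewards before fitting; a minimal sketch of the backward discounting pass they describe (an illustration, not necessarily the repo's exact reward transform):

import numpy as np

def discount_rewards(rewards, gamma=0.85):
    # Walk the episode backwards, accumulating gamma-discounted returns.
    returns = np.zeros(len(rewards), dtype=np.float64)
    running = 0.0
    for i in reversed(range(len(rewards))):
        running = rewards[i] + gamma * running
        returns[i] = running
    return returns

# Example: discount_rewards([0, 0, 1], gamma=0.85) -> [0.7225, 0.85, 1.0]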