Example #1
def create_environment():
    """Creates the environment, applies some wrappers and returns it."""
    tmp_env = gym_super_mario_bros.make(LEVEL_NAME)
    tmp_env = JoypadSpace(tmp_env, ACTION_SPACE)
    tmp_env = wrapper(tmp_env, FRAME_DIM)

    return tmp_env
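
The `wrapper` helper that all of these examples import is not shown on this page. Below is a minimal sketch of what such a preprocessing chain typically looks like, assuming frame skipping, grayscale conversion, resizing to FRAME_DIM, and frame stacking; the class names SkipFrames, PreprocessFrame and StackFrames are illustrative only, and the project's real wrappers.py (which apparently returns LazyFrames, see lazy_frame_to_tensor below) may differ.

# Sketch only; not the project's actual wrappers.py.
import collections

import cv2
import gym
import numpy as np


class SkipFrames(gym.Wrapper):
    """Repeat each chosen action for `skip` frames and sum up the rewards."""

    def __init__(self, env, skip=4):
        super().__init__(env)
        self._skip = skip

    def step(self, action):
        total_reward, state, done, info = 0.0, None, False, {}
        for _ in range(self._skip):
            state, reward, done, info = self.env.step(action)
            total_reward += reward
            if done:
                break
        return state, total_reward, done, info


class PreprocessFrame(gym.ObservationWrapper):
    """Convert RGB frames to grayscale and resize them to frame_dim."""

    def __init__(self, env, frame_dim):
        super().__init__(env)
        self._height, self._width = frame_dim[0], frame_dim[1]

    def observation(self, observation):
        frame = cv2.cvtColor(observation, cv2.COLOR_RGB2GRAY)
        return cv2.resize(frame, (self._width, self._height), interpolation=cv2.INTER_AREA)


class StackFrames(gym.Wrapper):
    """Stack the most recent frames along the last axis."""

    def __init__(self, env, num_stack=4):
        super().__init__(env)
        self._frames = collections.deque(maxlen=num_stack)

    def reset(self, **kwargs):
        frame = self.env.reset(**kwargs)
        for _ in range(self._frames.maxlen):
            self._frames.append(frame)
        return np.stack(list(self._frames), axis=-1)

    def step(self, action):
        frame, reward, done, info = self.env.step(action)
        self._frames.append(frame)
        return np.stack(list(self._frames), axis=-1), reward, done, info


def wrapper(env, frame_dim, frame_skip=4):
    """Compose the preprocessing chain around the raw Mario environment."""
    env = SkipFrames(env, skip=frame_skip)
    env = PreprocessFrame(env, frame_dim)
    env = StackFrames(env, num_stack=frame_dim[-1])
    return env
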
def record_one_episode(agent, episode):
    tmp_env = gym_super_mario_bros.make(LEVEL_NAME)
    tmp_env = JoypadSpace(tmp_env, ACTION_SPACE)
    tmp_env = Monitor(tmp_env, './videos/video-episode-{0:05d}'.format(episode), force=True)
    tmp_env = wrapper(tmp_env, FRAME_DIM, FRAME_SKIP)

    state = lazy_frame_to_tensor(tmp_env.reset())

    total_reward = 0
    while True:
        action = agent.get_action(state)

        next_state, reward, done, info = tmp_env.step(action)
        next_state = lazy_frame_to_tensor(next_state)

        # accumulate the reward before checking the terminal flag so the
        # final transition's reward is not dropped
        total_reward += reward
        state = next_state

        if done:
            break

    # close the Monitor wrapper so the recorded video is written to disk
    tmp_env.close()
    return total_reward
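
lazy_frame_to_tensor is another project helper that is not shown here. A plausible sketch follows, assuming the frame-stacking wrapper returns a LazyFrames-like object and the agent expects a PyTorch float tensor with a leading batch dimension; the exact channel layout depends on the real wrapper.

# Sketch only; the project's actual helper may differ.
import numpy as np
import torch


def lazy_frame_to_tensor(lazy_frame):
    frame = np.asarray(lazy_frame, dtype=np.float32)   # materialize the lazy frame stack
    tensor = torch.from_numpy(frame)
    if tensor.dim() == 3:                              # (H, W, C) -> (C, H, W)
        tensor = tensor.permute(2, 0, 1)
    return tensor.unsqueeze(0)                         # add a batch dimension
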
Example #3
def record_one_episode(agent):
    tmp_env = gym_super_mario_bros.make(LEVEL_NAME)
    tmp_env = JoypadSpace(tmp_env, ACTION_SPACE)
    tmp_env = Monitor(tmp_env, './video', force=True)
    tmp_env = wrapper(tmp_env, FRAME_DIM)

    state = lazy_frame_to_tensor(tmp_env.reset())

    total_reward = 0
    while True:
        action, _ = agent.select_action_based_on_state(state)

        next_state, reward, done, info = tmp_env.step(action)
        next_state = lazy_frame_to_tensor(next_state)

        # accumulate the reward before checking the terminal flag so the
        # final transition's reward is not dropped
        total_reward += reward
        state = next_state

        if done:
            break

    # close the Monitor wrapper so the recorded video is written to disk
    tmp_env.close()
    return total_reward
Example #4
#from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv
import time

from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import RIGHT_ONLY
from agent import DQNAgent
from wrappers import wrapper
from utils import get_args


# Take argument
arg = get_args()

# Build env (first level, right only)
env = gym_super_mario_bros.make(arg.env)
env = JoypadSpace(env, RIGHT_ONLY)
env = wrapper(env)
# Parameters
states = (84, 84, 4)
actions = env.action_space.n

# Agent
agent = DQNAgent(states=states, actions=actions, max_memory=100000, double_q=True)

# Episodes
# episodes = 100001
episodes = 101
rewards = []

# Timing
start = time.time()
step = 0
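
The snippet above stops right before the training loop. A generic sketch of the per-episode loop that typically follows such a setup is shown below; the agent methods used here (agent.run, agent.add, agent.learn) are assumptions and may not match the real DQNAgent interface in the source project.

# Hypothetical training loop; the `agent` method names are assumptions.
for e in range(episodes):
    state = env.reset()
    total_reward = 0

    while True:
        action = agent.run(state=state)                # epsilon-greedy action (assumed method)
        next_state, reward, done, info = env.step(action)

        agent.add(experience=(state, next_state, action, reward, done))  # store transition (assumed)
        agent.learn()                                  # one optimization step (assumed)

        total_reward += reward
        state = next_state
        step += 1

        if done or info.get('flag_get'):
            break

    rewards.append(total_reward)
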
Example #5
import wrappers
act = wrappers.wrapper("mind2")
Example #6
def train_model(parameters):
    #Initialization of environment and agent
    env = gym_super_mario_bros.make(parameters['environment'])
    env = JoypadSpace(env, RIGHT_ONLY)
    env = wrapper(env)

    states = (84, 84, 4)
    actions = env.action_space.n
    
    agent = DDQNagent(parameters, states, actions)

    if parameters['train']:
        #TENSORBOARD
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")    
        log_dir = 'logs/mario/' + current_time + '/10k'    
        summary_writer = tf.summary.create_file_writer(log_dir)    
        summary_writer.set_as_default()
        
        
        maxXpos = 0    # Maximum X position of the Agent
        max_reward = 0    # Maximum reward
        start_time = time.time()    # Start time
        
        #Initialization of variables for plots
        graph_reward = np.zeros(parameters['episodes_to_play'])    # Reward
        graph_pos = np.zeros(parameters['episodes_to_play'])    # Position
        graph_mean_reward = np.zeros(parameters['episodes_to_play'])    # Mean Reward
    
    
        episodes = parameters['episodes_to_play']    # Number of episodes to train
        rewards = []    # Rewards array
        
        
        start = time.time()    # Time for calculating processed frames per second
        step = 0    # Total steps
        
        #Learning cycle
        for e in range(episodes):
    
            #Reset the environment to its initial state
            state = env.reset()
        
            total_reward = 0    # Reward gained in the current episode
            iter = 0
        
            while True:
                #Select an action
                action = agent.run(state)
        
                #Apply action to environment
                next_state, reward, done, info = env.step(action)
    
                #Write new data to memory
                agent.update_memory(experience=(state, next_state, action, reward, done))
    
                #Learn
                agent.learn()
        
                #Accumulate the reward for this step
                total_reward += reward
        
                #Change current state to next one
                state = next_state
        
                iter += 1
                
                #Render
                if parameters['render']:
                    env.render()
        
                #Check finish condition
                if done or info['flag_get']:
                    break
        
            #Append the episode's average reward per step for plotting
            rewards.append(total_reward / iter)
            
            #Update info
            if maxXpos < info['x_pos']:
                maxXpos = info['x_pos']
            if max_reward < total_reward:
                max_reward = total_reward
            
            if info['flag_get']:
                agent.flag_reached += 1
            
            #Epsilon decay
            if agent.eps >= 0.0:
                agent.eps = agent.eps - agent.eps_decay
            
            #Update variables for plots
            graph_reward[e] = total_reward
            graph_pos[e] = info['x_pos']
            graph_mean_reward[e] = np.mean(graph_reward)
            
            #TENSORBOARD
            tf.summary.scalar("Rewards", total_reward, step=e)
            tf.summary.scalar("Position", info['x_pos'], step=e)
            tf.summary.scalar("Mean reward", np.mean(graph_reward), step=e)
            tf.summary.scalar("Flags", agent.flag_reached, step=e)
            tf.summary.scalar("Loss", agent.loss, step=e)
            
            
            #Console information
            print("Episode reward: " + str(total_reward) + ' - Pos: ' + str(info['x_pos']))
            #Detailed progress every 10 episodes
            if e % 10 == 0:
                end = time.time()
                print('Flags reached: ' + str(agent.flag_reached) + ' - Max reward: ' + str(max_reward))
                print('Episode {e} - '
                      'Frame {f} - '
                      'Frames/sec {fs} - '
                      'Epsilon {eps} - '
                      'Mean Reward {r} - '
                      'Time {t} sec - '
                      'Max pos {pos}'.format(e=e,
                                               f=agent.step,
                                               fs=np.round((agent.step - step) / (time.time() - start)),
                                               eps=np.round(agent.eps, 4),
                                               r=np.mean(rewards[-100:]),
                                               t=round(end - start_time),
                                               pos=maxXpos))
    
    
                start = time.time()    
                step = agent.step    
        
        #After learning draw plots and save weights
        draw_graph(graph_reward,'Rewards')
        draw_graph(graph_pos, 'Position')
        draw_graph(graph_mean_reward, 'Mean reward')
        agent.save_weights()
        env.close() 
        
    else:
        #If train is false, load saved weights and observe the trained agent
        print('Weights file path (hdf5): ')
        weights_name = input()
        try:
            agent.model_target.load_weights(weights_name)
            agent.model_test(env)
        except Exception:
            print("No weights file found with that name or at that path")
        env.close()
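
draw_graph is defined elsewhere in that project. The following is a minimal matplotlib sketch that matches the calls above, assuming each call plots one per-episode metric and saves it to disk; the real helper may differ.

# Sketch only; not the project's actual draw_graph.
import matplotlib.pyplot as plt
import numpy as np


def draw_graph(values, title):
    """Plot a per-episode metric and save it as a PNG named after the title."""
    plt.figure()
    plt.plot(np.arange(len(values)), values)
    plt.xlabel('Episode')
    plt.ylabel(title)
    plt.title(title)
    plt.savefig('{}.png'.format(title.replace(' ', '_')))
    plt.close()
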
Example #7
        current_states.append(current_state)
        target_predictions.append(target_prediction)

    # train the policy model based on the predictions of the target model
    policy_model.fit(np.asarray(current_states).squeeze(),
                     np.asarray(target_predictions).squeeze(),
                     batch_size=BATCH_SIZE,
                     verbose=0)


env = gym_super_mario_bros.make("SuperMarioBros-v0")
env = JoypadSpace(
    env, ACTION_SPACE
)  # An environment wrapper to convert binary to discrete action space.
# apply the wrapper
env = wrapper(env, FRAME_DIM)

# create the network
policy_net = DQNetwork(stacked_frame_dim=FRAME_DIM,
                       num_actions=env.action_space.n)
target_net = DQNetwork(stacked_frame_dim=FRAME_DIM,
                       num_actions=env.action_space.n)

# create the replay memory
replay_memory = ReplayMemory(REPLAY_MEMORY_CAPACITY)

# play the episodes
current_exploration = EXPLORATION_MAX
total_steps = 0
reward_history = []
mean_reward_history = []
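
ReplayMemory is also project code that is not shown here. A small deque-based sketch that would satisfy the constructor call above follows, assuming uniform random sampling of stored transitions; the actual implementation may differ.

# Sketch only; the project's actual ReplayMemory may store or sample differently.
import random
from collections import deque


class ReplayMemory:
    """Fixed-capacity ring buffer of transitions with uniform sampling."""

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)
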
Example #8
import wrappers
act = wrappers.wrapper("mind1")
Example #9
import wrappers
act = wrappers.wrapper("ben")