def run(self, solution, level, render, mode):
		env = gym_super_mario_bros.make(level)
		env = BinarySpaceToDiscreteSpaceEnv(env, COMPLEX_MOVEMENT)

		done = True
		reason_finish = "no_more_commands"

		pos = 0
		total_r = 0

		for step in range(len(solution)):
			if done:
				state = env.reset()

			state, reward, done, info = env.step(solution[pos])
			pos+=1


			if reward == -15:  # Mario died
				reason_finish = "death"
				break

			if mode == "level" and info['flag_get']:
				reason_finish = "win"
				break

			total_r = total_r + reward
			if render == "true":
				env.render()


		env.close()
		return total_r, pos, info, reason_finish
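# Hedged usage sketch for run() above. The enclosing class is not shown in this
# snippet, so `Runner` below is a hypothetical name for it; the random command
# sequence is illustrative only.
import random

runner = Runner()  # hypothetical instance of the class that defines run()
solution = [random.randrange(len(COMPLEX_MOVEMENT)) for _ in range(200)]
total_r, pos, info, reason = runner.run(solution, 'SuperMarioBros-1-1-v0', render="false", mode="level")
print(reason, total_r, pos)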
Example #2
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import pposgd_simple, cnn_policy
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    #env = make_atari(env_id)

    env = gym_super_mario_bros.make('SuperMarioBros-v1')
    # env = gym_super_mario_bros.make('SuperMarioBrosNoFrameskip-v3')

    env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
    env = ProcessFrame84(env)

    env = FrameMemoryWrapper(env)




    def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)
    env = bench.Monitor(env, logger.get_dir() and
        osp.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    #env = wrap_deepmind(env)


    def render_callback(lcl, _glb):
        # print(lcl['episode_rewards'])
        total_steps = lcl['env'].total_steps
        #if total_steps % 1000 == 0:
        #    print("Saving model to mario_model.pkl")
        #    act.save("../models/mario_model_{}.pkl".format(modelname))


        env.render()
        # pass


    pposgd_simple.learn(env, policy_fn,
        max_timesteps=int(num_timesteps * 1.1),
        timesteps_per_actorbatch=2048,
        clip_param=0.2, entcoeff=0.01,
        optim_epochs=4,
        optim_stepsize=1e-3, # 3e-4
        optim_batchsize=64, #256
        gamma=0.99, lam=0.95,
        schedule='linear',
        callback = render_callback
    )
    env.close()
Example #3
def mariocontext(marioEnv):
    mario_env = 'SuperMarioBros' + marioEnv.noFrameSkip + '-' + str(
        marioEnv.world) + '-' + str(marioEnv.stage) + '-v' + str(
            marioEnv.version)
    env = gym_super_mario_bros.make(mario_env)
    env = BinarySpaceToDiscreteSpaceEnv(env, marioEnv.action_encoding)
    yield env
    env.close()
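# The generator above yields an env and closes it afterwards. Whether the original
# project wraps it with contextlib.contextmanager or registers it as a pytest
# fixture is not shown, so the decorator below is an assumption; a minimal sketch:
from contextlib import contextmanager

import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv


@contextmanager
def mario_env(env_id='SuperMarioBros-1-1-v0', actions=SIMPLE_MOVEMENT):
    env = BinarySpaceToDiscreteSpaceEnv(gym_super_mario_bros.make(env_id), actions)
    try:
        yield env
    finally:
        env.close()  # always closed, even if the body raises

# with mario_env() as env:
#     state = env.reset()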
Example #4
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    #parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int, default=1)
    parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6)
    parser.add_argument('--dueling', type=int, default=1)
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    #parser.add_argument('--checkpoint-freq', type=int, default=10000)
    parser.add_argument('--checkpoint-freq', type=int, default=10)
    parser.add_argument('--checkpoint-path', type=str, default=None)

    args = parser.parse_args()
    logger.configure()
    set_global_seeds(args.seed)

    #env = make_atari(args.env)
    env = gym_super_mario_bros.make('SuperMarioBros-v3')
    #env = gym_super_mario_bros.make('SuperMarioBrosNoFrameskip-v3')
    env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
    env = ProcessFrame84(env)

    print("logger.get_dir():", logger.get_dir())
    print("PROJ_DIR:", PROJ_DIR)

    env = bench.Monitor(env, logger.get_dir())
    #env = deepq.wrap_atari_dqn(env)
    model = deepq.models.cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=bool(args.dueling),
    )

    act = deepq.learn(
        env,
        q_func=model,
        lr=1e-4,
        max_timesteps=args.num_timesteps,
        buffer_size=10000,
        exploration_fraction=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
        prioritized_replay=bool(args.prioritized),
        prioritized_replay_alpha=args.prioritized_replay_alpha,
        checkpoint_freq=args.checkpoint_freq,
        #        checkpoint_path=args.checkpoint_path,
        #callback=deepq_callback,
        print_freq=1)
    print("Saving model to mario_model.pkl")
    act.save("../models/mario_model_{}.pkl".format(
        datetime.datetime.now().isoformat()))

    env.close()
Example #5
class MarioEnv:
    def __init__(self, os='mac', display=False):
        self.display = display
        if os == 'mac' or os == 'linux':
            env = gym_super_mario_bros.make('SuperMarioBros-v0')
            self.env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
        else:
            raise Exception("bad os")
        self.act_dim = self.env.action_space.n
        self.obs_dim = (1, 128, 128)
        print("env created with act_dim", self.act_dim, "obs_dim",
              self.obs_dim)
        self.transform = transforms.Compose([
            transforms.ToTensor(),  # chain 2 transforms together using list.
            transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
        ])

    def reset(self):
        state = self.env.reset()
        return self.__resize_image(state)

    def step(self, action):
        state, reward, done, info = self.env.step(action)
        if reward == 0:
            reward = -0.5
        state_t = self.__resize_image(state)
        return state_t, \
               np.reshape(reward, -1), \
               np.reshape(done, -1)

    def close(self):
        self.env.close()

    def __resize_image(self, state):
        state_new = cv2.resize(state, (128, 128))
        img = Image.fromarray(state_new)
        state_t = self.transform(img)[0, :, :].unsqueeze(0)
        state_t = state_t.float().to(DEVICE)
        return state_t.unsqueeze(0)

    def render(self):
        if self.display:
            self.env.render()
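# Minimal rollout sketch for the MarioEnv wrapper above (DEVICE, the torchvision
# transforms, etc. come from the surrounding module; the episode length is
# illustrative). Note that step() returns reward and done as length-1 arrays.
import random

env = MarioEnv(os='mac', display=True)
state = env.reset()                        # tensor of shape (1, 1, 128, 128)
for _ in range(200):
    action = random.randrange(env.act_dim)
    state, reward, done = env.step(action)
    env.render()
    if done[0]:
        state = env.reset()
env.close()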
Example #6
def main():
    env = gym_super_mario_bros.make('SuperMarioBros-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)

    done = True
    max_step = 5000
    print(env.observation_space.shape)
    # on Windows, ascii=True is needed so the progress bar does not wrap to new lines
    qbar = tqdm(total=max_step, ascii=True)
    for step in range(max_step):
        qbar.update()
        if done:
            state = env.reset()
        action = get_action(state, env.action_space)
        state, reward, done, info = env.step(action)
        if done:
            print(str(step) + " the hero has fallen, try again: " + str(info))
        env.render()
    env.close()
    qbar.close()
Example #7
## Base model to run the game, using random movements
from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv
from aux import *
import gym_super_mario_bros
from gym_super_mario_bros.actions import COMPLEX_MOVEMENT
env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = BinarySpaceToDiscreteSpaceEnv(env, COMPLEX_MOVEMENT)

done = True
oldi = {
    'coins': 0,
    'flag_get': False,
    'life': 2,
    'score': 0,
    'stage': 1,
    'status': 'small',
    'time': 400,
    'world': 1,
    'x_pos': 40
}
for step in range(100):
    if done:
        state = env.reset()
    state, rwd, done, info = env.step(1)  #env.action_space.sample())
    print(reward(info, oldi), "vs", rwd)
    print(env.observation_space.shape)
    oldi = info
    env.render()

env.close()
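# The loop above compares a hand-rolled reward(info, oldi) from the local aux
# module (not shown here) against the environment's own reward. For reference,
# a sketch that mirrors the reward documented for gym_super_mario_bros:
# x-position delta, clock delta, and a death penalty. This is an approximation,
# not the actual aux.reward implementation.
def approx_reward(info, oldi):
    v = info['x_pos'] - oldi['x_pos']              # progress to the right
    c = info['time'] - oldi['time']                # clock penalty (0 or negative)
    d = -15 if info['life'] < oldi['life'] else 0  # death penalty
    return v + c + d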
Example #8
def main():
    movement = SIMPLE_MOVEMENT
    movement.append(['left', 'A'])
    movement.append(['left', 'B'])
    movement.append(['left', 'A', 'B'])
    #movement.append(['B'])
    #movement.append(['down'])
    #movement.append(['up'])

    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, movement)

    # channels = image channels per frame; frames = length of the frame history
    # if resize_height and final_height differ, final_height must be smaller and the image is cropped
    channels = 3
    frames = 4
    width = 128
    resize_height = 180
    final_height = 128
    bottom_chop = 15

    epsilon = 0.0

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(1)
    device = torch.device("cuda" if use_cuda else "cpu")

    model = simple_net(channels, len(movement), device).to(device)

    model_file = 'mario_agent'
    model.load_state_dict(torch.load(model_file))

    max_steps = 5000
    num_eps = 1

    for episode in range(num_eps):
        print('Episode {}'.format(episode + 1))
        state = env.reset()
        state = preprocess(state, [resize_height, width, 3], final_height,
                           bottom_chop)
        state = torch.cat((state, state, state, state))
        action = 0

        episode_reward = 0

        for step in range(max_steps):
            if step % 3 == 0:
                if random.random() < epsilon:
                    action = random.randint(0, len(movement) - 1)
                else:
                    q_val, action, q_vals = maxQ(state, model, device)

            next_state, reward, done, info = env.step(int(action))

            if reward > 0:
                reward = 1
            else:
                reward = -1

            episode_reward += reward

            next_state = preprocess(next_state, [resize_height, width, 3],
                                    final_height, bottom_chop)
            next_state = torch.cat((state[3:, :, :], next_state))

            state = next_state

            env.render()
            time.sleep(0.03)

            if done:
                break

    env.close()
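# The evaluation loop above keeps a rolling history of 4 RGB frames by
# concatenating along the channel dimension (4 frames * 3 channels = 12
# channels) and dropping the oldest 3 channels each step. A minimal sketch of
# that rolling-window update with dummy tensors (sizes taken from the snippet):
import torch

frames, channels, height, width = 4, 3, 128, 128
state = torch.zeros(frames * channels, height, width)   # stacked history
new_frame = torch.rand(channels, height, width)         # latest preprocessed frame

state = torch.cat((state[channels:], new_frame))        # drop oldest, append newest
print(state.shape)                                      # torch.Size([12, 128, 128])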
Example #9
def replay_genome(genome, movements, gen):
    env_expanded = gym_super_mario_bros.SuperMarioBrosEnv(frames_per_step=1,
                                                          rom_mode='vanilla')
    env = BinarySpaceToDiscreteSpaceEnv(env_expanded, movements)

    print('Number of genes: ', len(genome.connection_genes))
    for gene in genome.connection_genes:
        print(gene.in_node, gene.out_node, gene.weight, gene.innovation_number,
              gene.type, gene.enabled)

    done = True
    unticked = 0
    tick_interval = 1 / 30
    last_tick_time = time.time()

    fps = 0
    frames = 0
    last_fps_time = time.time()

    for _ in range(500000):

        unticked += time.time() - last_tick_time
        last_tick_time = time.time()
        ticked = False

        # while unticked >= tick_interval:
        if done:
            state = env.reset()

        state_downscaled = get_sensor_map(env_expanded)

        action = genome.calculate_action(state_downscaled)

        # print('\rFPS: {:.3f}'.format(fps), end=' ')
        # print(vectofixedstr(action, 10), end=' ')
        action = np.argmax(action)
        print('\rtaking action', movements[action], end='', flush=True)

        state, reward, done, info = env.step(action)

        #filename = get_path_of('all_pictures/mario/')
        #imsave(filename + 'mario_' + str(_) + '.png', state)

        save_state = np.full((13, 10, 3), 255, dtype=int)

        COLORS = [[250, 250, 250], [0, 0, 0], [196, 0, 0], [0, 0, 196]]

        for i in range(13):
            for j in range(10):
                if state_downscaled[(i, j)] == -1:
                    save_state[(i, j)] = COLORS[3]
                elif state_downscaled[(i, j)] == 0:
                    save_state[(i, j)] = COLORS[0]
                else:
                    save_state[(i, j)] = COLORS[1]

        save_state[(7, 2)] = COLORS[2]

        # filename = get_path_of('all_pictures/input_downscaled/')
        # imsave(filename + 'state_' + str(_) + '.png', save_state.astype(np.uint8))

        # make_controller(movements[action], _, gen)

        env.render()

        if info["life"] <= 2:
            died = True
            break

        ticked = True
        frames += 1
        unticked -= tick_interval

        # if ticked:
        #     now = time.time()
        #     if now - last_fps_time >= 1:
        #         fps = frames / (now - last_fps_time)
        #         last_fps_time = now
        #         frames = 0
        # else:
        #     time.sleep(0.001)

    env.close()
Example #10
def main():
    movement = SIMPLE_MOVEMENT
    movement.append(['left', 'A'])
    movement.append(['left', 'B'])
    movement.append(['left', 'A', 'B'])
    #movement.append(['B'])
    #movement.append(['down'])
    #movement.append(['up'])

    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, movement)

    # channels acts as the number of frames in the history
    # if resize_height and final_height differ, final_height must be smaller and the image is cropped
    channels = 4
    # width = 84
    # resize_height = 110
    # final_height = 84
    width=128
    resize_height = 168
    final_height = 128
    size = [channels, final_height, width]

    batch_size = 16
    replay_capacity = 100000
    replay_dir = '/home/hansencb/mario_replay/'

    gamma = 0.95

    start_epsilon = 0.3
    stop_epsilon = 0.01
    epsilon_decay = 0.00025

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(1)
    device = torch.device("cuda" if use_cuda else "cpu")

    model = simple_net(channels, len(movement), device).to(device)
    target_model = simple_net(channels, len(movement), device).to(device)

    data_file = 'data_loader'
    model_file = 'mario_agent'
    continue_train = True
    model.load_state_dict(torch.load(model_file))

    if continue_train:
        target_model.load_state_dict(torch.load(model_file))

    lr = 0.00005
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    total_reward_file = 'total_reward.txt'


    if not continue_train:
        with open(total_reward_file, 'w') as f:
            f.write('Reward\tSteps\n')


    max_steps = 5000
    num_eps = 5000

    if continue_train:
        with open(data_file, 'rb') as f:
            data = pickle.load(f)
            data.batch_size = batch_size
    else:
        data = dataset(replay_capacity, batch_size, replay_dir, size)

        #initialize memory with 100 experiences
        done = True
        for i in range(100):
            if done:
                state = env.reset()
                state = preprocess(state, [resize_height, width], final_height)
                state = torch.cat((state, state, state, state))

            action = random.randint(0,len(movement)-1)
            next_state, reward, done, info = env.step(int(action))

            # if reward>0:
            #     reward = 1
            # else:
            #     reward = -1
            reward /= 15
            if reward == 0:
                reward = -0.1

            next_state = preprocess(next_state, [resize_height, width], final_height)
            next_state = torch.cat((state[1:, :, :], next_state))

            trans = transition(state, action, reward, next_state, done)
            data.add(trans)

            state = next_state


    tau = 0
    max_tau = 2000
    decay_step = 0
    farthest = 3000
    cur_x = 1

    #training loop
    for episode in range(num_eps):
        print('Episode {}'.format(episode+1))
        state = env.reset()
        state = preprocess(state, [resize_height, width], final_height)
        state = torch.cat((state, state, state, state))
        action = 0

        episode_reward = 0

        for step in range(max_steps):
            tau += 1


            #epsilon = stop_epsilon+(start_epsilon - stop_epsilon)*np.exp(-epsilon_decay*decay_step)
            epsilon = start_epsilon * np.exp(1-(1/(cur_x/farthest)))
            if epsilon < stop_epsilon:
                epsilon = stop_epsilon

            if random.random() < epsilon:
                action = random.randint(0,len(movement)-1)
            else:
                q_val, action, q_vals = maxQ(state, model, device)

            next_state, reward, done, info = env.step(int(action))

            cur_x = info['x_pos']

            if cur_x > farthest:
                farthest = cur_x

            # if reward > 0:
            #     reward = 1
            # else:
            #     reward = -1

            reward /= 15
            if reward == 0:
                reward = -0.1

            episode_reward += reward

            next_state = preprocess(next_state, [resize_height, width], final_height)
            next_state = torch.cat((state[1:,:,:], next_state))

            trans = transition(state, action, reward, next_state, done)
            data.add(trans)
            batch = data.get_batch(model, target_model, device, gamma)
            loss, abs_err = train(model, device, optimizer, batch)

            data.update_batch(batch['idx'], np.squeeze(torch.Tensor.numpy(abs_err)))

            state = next_state

            env.render()
            #time.sleep(0.03)

            if tau > max_tau:
                target_model.load_state_dict(model.state_dict())
                tau = 0

            if done:
                break

        decay_step += step
        with open(total_reward_file, 'a') as f:
            f.write('{}\t{}\n'.format(episode_reward, step))

        if episode % 5 == 0:
            with open(model_file, 'wb') as f:
                torch.save(model.state_dict(), f)
            with open(data_file, 'wb') as f:
                pickle.dump(data, f)


    env.close()
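# The training loop above ties exploration to level progress rather than to the
# step count: epsilon stays near stop_epsilon in territory Mario has already
# mastered and rises toward start_epsilon as cur_x approaches the farthest x
# position reached so far. A standalone sketch of that schedule (constants
# copied from the snippet):
import numpy as np

def progress_epsilon(cur_x, farthest, start_epsilon=0.3, stop_epsilon=0.01):
    eps = start_epsilon * np.exp(1 - 1 / (cur_x / farthest))
    return max(eps, stop_epsilon)

print(progress_epsilon(100, 3000))   # far behind the frontier -> ~stop_epsilon
print(progress_epsilon(2900, 3000))  # near the frontier -> close to start_epsilon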
Example #11
def main():
    movement = SIMPLE_MOVEMENT
    movement.append(['left', 'A'])
    movement.append(['left', 'B'])
    movement.append(['left', 'A', 'B'])
    movement.append(['B'])
    movement.append(['down'])
    movement.append(['up'])

    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, movement)

    # channels acts as the number of frames in the history
    # if resize_height and final_height differ, final_height must be smaller and the image is cropped
    channels = 4
    width = 84
    resize_height = 110
    final_height = 84
    size = [channels, final_height, width]

    batch_size = 32
    replay_capacity = 100000
    replay_dir = '/home/hansencb/mario_replay/'
    epsilon = 1
    gamma = 0.9

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(1)
    device = torch.device("cuda" if use_cuda else "cpu")

    model = simple_net(channels, len(movement), device).to(device)
    target_model = simple_net(channels, len(movement), device).to(device)

    model_file = 'mario_agent'
    model.load_state_dict(torch.load(model_file))
    target_model.load_state_dict(torch.load(model_file))

    lr = 0.001
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    total_reward_file = 'total_reward.txt'
    with open(total_reward_file, 'w') as f:
        f.write('Reward\tSteps\n')

    max_steps = 5000
    num_eps = 1000

    data = dataset(replay_capacity, batch_size, replay_dir, 1, size)

    for episode in range(num_eps):
        print('Episode {}'.format(episode + 1))
        state = env.reset()
        state = preprocess(state, [resize_height, width], final_height)
        state = torch.cat((state, state, state, state))
        action = 0

        episode_reward = 0

        for step in range(max_steps):
            if step % 3 == 0:
                if random.random() < epsilon:
                    action = random.randint(0, len(movement) - 1)
                else:
                    q_val, action = maxQ(state, model, device)

            next_state, reward, done, info = env.step(int(action))

            if reward > 0:
                reward = 1
            else:
                reward = -1

            episode_reward += reward

            next_state = preprocess(next_state, [resize_height, width],
                                    final_height)
            next_state = torch.cat((state[1:, :, :], next_state))

            trans = transition(state, action, reward, next_state, done)
            data.add(trans)
            train(model, device, optimizer,
                  data.get_batch(model, device, gamma))

            state = next_state

            env.render()
            #time.sleep(0.03)

            if done:
                with open(total_reward_file, 'a') as f:
                    f.write('{}\t{}\n'.format(episode_reward, step))

                break

        epsilon -= (1 / num_eps)
        if episode % 10 == 0:
            target_model.load_state_dict(model.state_dict())

            with open(model_file, 'wb') as f:
                torch.save(model.state_dict(), f)

    env.close()
Example #12
           kernel_size=(5, 5),
           strides=(1, 1),
           activation='relu',
           input_shape=image_shape))
model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
model.add(Conv2D(64, (5, 5), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(7, activation='linear'))
model.compile(loss='mse',
              optimizer=Adam(clipnorm=10, lr=1e-4, decay=1e-6, epsilon=1e-4))
"""
done = True
for step in range(5000):
	if done:
		state = env.reset()
	state, reward, done, info = env.step(env.action_space.sample())
	print(env.action_space.samp)

env.close()
"""
episodes = 1000
gamma = 0.99
epsilon = 0.1
reward_count = 0
reward_history = []
max_episode_reward = 0
Example #13
class Agent:
    def __init__(self, level_name):
        self.level_name = level_name
        # setup environment
        self.env = gym_super_mario_bros.make(level_name)
        self.env = BinarySpaceToDiscreteSpaceEnv(self.env, SIMPLE_MOVEMENT)
        # one hot encoded version of our actions
        self.possible_actions = np.array(
            np.identity(self.env.action_space.n, dtype=int).tolist())

        # reset graph
        tf.reset_default_graph()

        # instantiate the DQNetwork
        self.DQNetwork = DQNetwork(state_size, action_size, learning_rate)

        # instantiate memory
        self.memory = Memory(max_size=memory_size)

        # initialize deque with zero images
        self.stacked_frames = deque(
            [np.zeros((100, 128), dtype=int) for i in range(stack_size)],
            maxlen=4)

        for i in range(pretrain_length):
            # If it's the first step
            if i == 0:
                state = self.env.reset()
                state, self.stacked_frames = stack_frames(
                    self.stacked_frames, state, True)

            # Get next state, the rewards, done by taking a random action
            choice = random.randint(1, len(self.possible_actions)) - 1
            action = self.possible_actions[choice]
            next_state, reward, done, _ = self.env.step(choice)

            # stack the frames
            next_state, self.stacked_frames = stack_frames(
                self.stacked_frames, next_state, False)

            # if the episode is finished (we're dead)
            if done:
                # we finished the episode
                next_state = np.zeros(state.shape)

                # add experience to memory
                self.memory.add((state, action, reward, next_state, done))

                # start a new episode
                state = self.env.reset()
                state, self.stacked_frames = stack_frames(
                    self.stacked_frames, state, True)
            else:
                # add experience to memory
                self.memory.add((state, action, reward, next_state, done))

                # our new state is now the next_state
                state = next_state

        # saver will help us save our model
        self.saver = tf.train.Saver()

        # setup tensorboard writer
        self.writer = tf.summary.FileWriter("logs/")

        # losses
        tf.summary.scalar("Loss", self.DQNetwork.loss)

        self.write_op = tf.summary.merge_all()

    def predict_action(self, sess, explore_start, explore_stop, decay_rate,
                       decay_step, state, actions):
        # first we randomize a number
        exp_exp_tradeoff = np.random.rand()

        explore_probability = explore_stop + (
            explore_start - explore_stop) * np.exp(-decay_rate * decay_step)

        if explore_probability > exp_exp_tradeoff:
            # make a random action
            choice = random.randint(1, len(self.possible_actions)) - 1
            action = self.possible_actions[choice]
        else:
            # estimate the Qs values state
            Qs = sess.run(self.DQNetwork.output,
                          feed_dict={
                              self.DQNetwork.inputs_:
                              state.reshape((1, *state.shape))
                          })

            # take the biggest Q value (= best action)
            choice = np.argmax(Qs)
            action = self.possible_actions[choice]

        return action, choice, explore_probability

    def play_notebook(self):
        import matplotlib.pyplot as plt
        # imports to render env to gif
        from JSAnimation.IPython_display import display_animation
        from matplotlib import animation
        from IPython.display import display

        # http://mckinziebrandon.me/TensorflowNotebooks/2016/12/21/openai.html
        def display_frames_as_gif(frames):
            """
            Displays a list of frames as a gif, with controls
            """
            #plt.figure(figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0), dpi = 72)
            patch = plt.imshow(frames[0])
            plt.axis('off')

            def animate(i):
                patch.set_data(frames[i])

            anim = animation.FuncAnimation(plt.gcf(),
                                           animate,
                                           frames=len(frames),
                                           interval=50)
            display(display_animation(anim, default_mode='loop'))

        frames = []
        with tf.Session() as sess:
            total_test_rewards = []

            # Load the model
            self.saver.restore(sess, "models/{0}.cpkt".format(self.level_name))

            for episode in range(1):
                total_rewards = 0

                state = self.env.reset()
                state, self.stacked_frames = stack_frames(
                    self.stacked_frames, state, True)

                print("****************************************************")
                print("EPISODE ", episode)

                while True:
                    # Reshape the state
                    state = state.reshape((1, *state_size))
                    # Get action from Q-network
                    # Estimate the Qs values state
                    Qs = sess.run(self.DQNetwork.output,
                                  feed_dict={self.DQNetwork.inputs_: state})

                    # Take the biggest Q value (= the best action)
                    choice = np.argmax(Qs)

                    #Perform the action and get the next_state, reward, and done information
                    next_state, reward, done, _ = self.env.step(choice)
                    frames.append(self.env.render(mode='rgb_array'))

                    total_rewards += reward

                    if done:
                        print("Score", total_rewards)
                        total_test_rewards.append(total_rewards)
                        break

                    next_state, self.stacked_frames = stack_frames(
                        self.stacked_frames, next_state, False)
                    state = next_state

            self.env.close()

        display_frames_as_gif(frames)

    def play(self):
        with tf.Session() as sess:
            total_test_rewards = []

            # Load the model
            self.saver.restore(sess, "models/{0}.cpkt".format(self.level_name))

            for episode in range(1):
                total_rewards = 0

                state = self.env.reset()
                state, self.stacked_frames = stack_frames(
                    self.stacked_frames, state, True)

                print("****************************************************")
                print("EPISODE ", episode)

                while True:
                    # Reshape the state
                    state = state.reshape((1, *state_size))
                    # Get action from Q-network
                    # Estimate the Qs values state
                    Qs = sess.run(self.DQNetwork.output,
                                  feed_dict={self.DQNetwork.inputs_: state})

                    # Take the biggest Q value (= the best action)
                    choice = np.argmax(Qs)

                    #Perform the action and get the next_state, reward, and done information
                    next_state, reward, done, _ = self.env.step(choice)
                    self.env.render()

                    total_rewards += reward

                    if done:
                        print("Score", total_rewards)
                        total_test_rewards.append(total_rewards)
                        break

                    next_state, self.stacked_frames = stack_frames(
                        self.stacked_frames, next_state, False)
                    state = next_state
            self.env.close()

    def train(self):
        with tf.Session() as sess:
            # initialize the variables
            sess.run(tf.global_variables_initializer())

            # initialize decay rate (that will be used to reduce epsilon)
            decay_step = 0

            for episode in range(total_episodes):
                # set step to 0
                step = 0

                # initialize rewards of episode
                episode_rewards = []

                # make a new episode and observe the first state
                state = self.env.reset()

                # remember that stack frame function
                state, self.stacked_frames = stack_frames(
                    self.stacked_frames, state, True)

                print("Episode:", episode)

                while step < max_steps:
                    step += 1
                    #print("step:", step)

                    # increase decay_step
                    decay_step += 1

                    # predict an action
                    action, choice, explore_probability = self.predict_action(
                        sess, explore_start, explore_stop, decay_rate,
                        decay_step, state, self.possible_actions)

                    # perform the action and get the next_state, reward, and done information
                    next_state, reward, done, _ = self.env.step(choice)

                    if episode_render:
                        self.env.render()

                    # add the reward to total reward
                    episode_rewards.append(reward)

                    # the game is finished
                    if done:
                        print("done")
                        # the episode ends so no next state
                        next_state = np.zeros((110, 84), dtype=int)

                        next_state, self.stacked_frames = stack_frames(
                            self.stacked_frames, next_state, False)

                        # set step = max_steps to end episode
                        step = max_steps

                        # get total reward of the episode
                        total_reward = np.sum(episode_rewards)

                        print("Episode:", episode, "Total reward:",
                              total_reward, "Explore P:", explore_probability,
                              "Training Loss:", loss)

                        #rewards_list.append((episode, total_reward))

                        # store transition <s_i, a, r_{i+1}, s_{i+1}> in memory
                        self.memory.add(
                            (state, action, reward, next_state, done))
                    else:
                        # stack frame of the next state
                        next_state, self.stacked_frames = stack_frames(
                            self.stacked_frames, next_state, False)

                        # store transition <s_i, a, r_{i+1}, s_{i+1}> in memory
                        self.memory.add(
                            (state, action, reward, next_state, done))

                        # s_{i} := s_{i+1}
                        state = next_state

                    ### Learning part
                    # obtain random mini-batch from memory
                    batch = self.memory.sample(batch_size)
                    states_mb = np.array([each[0] for each in batch], ndmin=3)
                    actions_mb = np.array([each[1] for each in batch])
                    rewards_mb = np.array([each[2] for each in batch])
                    next_states_mb = np.array([each[3] for each in batch],
                                              ndmin=3)
                    dones_mb = np.array([each[4] for each in batch])

                    target_Qs_batch = []

                    # get Q values for next_state
                    Qs_next_state = sess.run(
                        self.DQNetwork.output,
                        feed_dict={self.DQNetwork.inputs_: next_states_mb})

                    # set Q_target = r if the episode ends at s_{i+1},
                    # otherwise r + gamma * max_a Q(s_{i+1}, a)
                    for i in range(len(batch)):
                        terminal = dones_mb[i]

                        # if we are in a terminal state, the target is just the reward
                        if terminal:
                            target_Qs_batch.append(rewards_mb[i])
                        else:
                            target = rewards_mb[i] + gamma * np.max(
                                Qs_next_state[i])
                            target_Qs_batch.append(target)

                    targets_mb = np.array([each for each in target_Qs_batch])

                    loss, _ = sess.run(
                        [self.DQNetwork.loss, self.DQNetwork.optimizer],
                        feed_dict={
                            self.DQNetwork.inputs_: states_mb,
                            self.DQNetwork.target_Q: targets_mb,
                            self.DQNetwork.actions_: actions_mb
                        })

                    # write tf summaries
                    summary = sess.run(self.write_op,
                                       feed_dict={
                                           self.DQNetwork.inputs_: states_mb,
                                           self.DQNetwork.target_Q: targets_mb,
                                           self.DQNetwork.actions_: actions_mb
                                       })
                    self.writer.add_summary(summary, episode)
                    self.writer.flush()

                # save model every 5 episodes
                if episode % 5 == 0:
                    self.saver.save(sess,
                                    "models/{0}.cpkt".format(self.level_name))
                    print("Model Saved")
Example #14
class MarioEnvironment(AbstractEnvironment):
    """
    Standard Super Mario Bros Environment
    https://github.com/Kautenja/gym-super-mario-bros
    """
    def __init__(self,
                 game_name,
                 task_name,
                 action_mode=SIMPLE_MOVEMENT,
                 state_size=None):
        """
        Args:
        game_name : string
        game_name = name of the game (e.g. SuperMarioBros-5-1-v0)
        task_name : string
        task_name = name of the task
        state_size : list or tuple or None
        state_size = size of state, [h, w] or [h, w, c]
        """
        self.game_name = game_name
        self.task_name = task_name
        self.action_mode = action_mode
        self.state_size = state_size
        self.env = gym_super_mario_bros.make(game_name)
        self.env = BinarySpaceToDiscreteSpaceEnv(self.env, self.action_mode)
        self.n_action = self.env.action_space.n
        self.actions = [a for a in range(self.n_action)]
        self.new_episode()

    def get_state(self, setting=None):
        """
        Get Current State
        Args:
        setting : dictionary
        setting = setting for states
            'resolution' : list or tuple or None
            'resolution' = resolution of states, [h, w, c] or [h, w]
        Returns:
        state : numpy.ndarray
        state = current screen, shape [h, w, c], values locate at [0, 1]
        """
        if (setting is None or ('resolution' not in setting.keys())):
            resolution = self.state_size
        else:
            resolution = setting['resolution']
        normalized = False
        if (len(resolution) == 3 and resolution[2] == 1):
            state = rgb2grey(self.ob)
            normalized = True
        else:
            state = self.ob
        if (state.ndim == 2):
            state = np.expand_dims(state, axis=-1)
        assert (state.ndim == 3), 'shape of screen should be [h, w, c]'
        state = resize(state, resolution[:2], preserve_range=True)
        state = state.astype(float)
        if (not normalized):
            state /= 255.
        return state

    def apply_action(self, action, num_repeat):
        """
        Apply Actions To The Environment And Get Reward
        Args:
        action : int
        action = applied action
        num_repeat : int
        num_repeat = number of repeated actions
        Returns:
        reward : float
        reward = reward of last action
        """
        assert (not self.done), 'The episode is done'
        reward = 0
        for _ in range(num_repeat):
            self.ob, reward, self.done, _ = self.env.step(action)
            self.score += reward
            if (self.done):
                break
        reward = reward_reshape(reward, self.game_name, self.task_name)
        return reward

    def new_episode(self):
        """
        Start A New Episode
        """
        self.ob = self.env.reset()
        self.done = False
        self.score = 0

    def episode_end(self):
        """
        Check If The Episode Ends
        Returns:
        ep_end : bool
        ep_end = when the episode finishes, return True
        """
        return self.done

    def action_set(self):
        """
        Get Actions Set
        Returns:
        actions : list
        actions = list of actions
        """
        return self.actions

    def available_action(self):
        """
        Get Indices of Available Actions For Current State
        Returns:
        available_ind : list
        available_ind = indices of available action
        """
        return list(range(self.n_action))

    def episode_total_score(self):
        """
        Get Total Score For Last Episode
        """
        return self.score

    def close(self):
        """
        Close The Environment
        """
        self.env.close()
        return True
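# A short usage sketch for MarioEnvironment above. reward_reshape and the
# AbstractEnvironment base class come from the surrounding project; the game
# and task names below are illustrative, not taken from the snippet.
import random

env = MarioEnvironment('SuperMarioBros-1-1-v0', 'reach_flag',
                       state_size=(84, 84, 1))
while not env.episode_end():
    state = env.get_state()                     # float image in [0, 1]
    action = random.choice(env.action_set())    # any legal action index
    reward = env.apply_action(action, num_repeat=4)
print('total score:', env.episode_total_score())
env.close()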
Example #15
def main():
    movement = SIMPLE_MOVEMENT
    movement.append(['left', 'A'])
    movement.append(['left', 'B'])
    movement.append(['left', 'A', 'B'])
    #movement.append(['B'])
    #movement.append(['down'])
    #movement.append(['up'])

    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, movement)

    # channels = image channels per frame; frames = length of the frame history
    # if resize_height and final_height differ, final_height must be smaller and the image is cropped
    channels = 3
    frames = 4
    width = 128
    resize_height = 180
    final_height = 128
    bottom_chop = 15
    size = [channels * frames, final_height, width]

    batch_size = 16
    replay_capacity = 100000
    replay_dir = '/home-local/bayrakrg/mario_replay/'
    start_epsilon = 1.0
    stop_epsilon = 0.01
    epsilon_decay = 0.00005
    gamma = 0.75

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(1)
    device = torch.device("cuda" if use_cuda else "cpu")

    model = simple_net(channels, len(movement), device).to(device)
    target_model = simple_net(channels, len(movement), device).to(device)

    model_file = 'mario_agent'
    model.load_state_dict(torch.load(model_file))
    target_model.load_state_dict(torch.load(model_file))

    lr = 0.0001
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    total_reward_file = 'total_reward.txt'
    with open(total_reward_file, 'w') as f:
        f.write('Reward\tSteps\n')

    max_steps = 500
    num_eps = 10000

    data = dataset(replay_capacity, batch_size, replay_dir, 1, size)

    tau = 0
    max_tau = 10000
    decay_step = 0

    for episode in range(num_eps):
        print('Episode {}'.format(episode + 1))
        state = env.reset()
        state = preprocess(state, [resize_height, width, 3], final_height,
                           bottom_chop)
        state = torch.cat((state, state, state, state))
        action = 0
        episode_reward = 0

        for step in range(max_steps):
            tau += 1
            decay_step += 1

            epsilon = stop_epsilon + (start_epsilon - stop_epsilon) * np.exp(
                -epsilon_decay * decay_step)

            if random.random() < epsilon:
                action = random.randint(0, len(movement) - 1)
            else:
                q_val, action, q_vals = maxQ(state, model, device)

            next_state, reward, done, info = env.step(int(action))

            if step == max_steps - 1:
                reward -= 10

            if reward > 0:
                reward = 1
            else:
                reward = -1

            episode_reward += reward

            next_state = preprocess(next_state, [resize_height, width, 3],
                                    final_height, bottom_chop)
            next_state = torch.cat((state[3:, :, :], next_state))

            trans = transition(state, action, reward, next_state, done)
            data.add(trans)
            train(model, device, optimizer,
                  data.get_batch(model, target_model, device, gamma))

            state = next_state

            env.render()

            if tau > max_tau:
                target_model.load_state_dict(model.state_dict())
                tau = 0

            if done:
                break

        with open(total_reward_file, 'a') as f:
            f.write('{}\t{}\n'.format(episode_reward, step))

        if episode % 5 == 0:
            with open(model_file, 'wb') as f:
                torch.save(model.state_dict(), f)

    env.close()
Example #16
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    #parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--dueling', type=int, default=0)
    #parser.add_argument('--checkpoint-freq', type=int, default=10000)
    parser.add_argument('--checkpoint-freq', type=int, default=10000)
    parser.add_argument('--checkpoint-path', type=str, default='/.')

    args = parser.parse_args()
    # TODO change logging dir for tensorboard
    #logger.configure(dir=None, format_strs='stdout,log,csv,json,tensorboard')
    #logger.configure(dir=None, format_strs=['stdout', 'log', 'csv', 'json', 'tensorboard'])
    timestart = datetime.datetime.fromtimestamp(
        time.time()).strftime('%Y-%m-%d-%H:%M:%S')
    logger.configure(
        dir=PROJ_DIR + "/../tensorboard/" + str(timestart),
        format_strs=['stdout', 'log', 'csv', 'json', 'tensorboard'])
    logger.set_level(logger.INFO)
    set_global_seeds(args.seed)

    env = gym_super_mario_bros.make('SuperMarioBros-v1')
    #wrap environment
    env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)

    #record videos of an episode
    env = VideoRecorderWrapper(env, PROJ_DIR + "/../video", str(timestart), 50)
    #the agent has only one trial
    env = EpisodicLifeEnv(env)

    # nes_py
    #preprocess the input frame
    env = DownsampleEnv(env, (84, 84))
    #set death penalty
    env = PenalizeDeathEnv(env, penalty=-25)
    # stack 4 frames as input
    env = FrameStackEnv(env, 4)

    #print tensorboard log information
    print("logger.get_dir():", logger.get_dir())
    print("PROJ_DIR:", PROJ_DIR)

    act = None
    #enable output in the terminal
    env = bench.Monitor(env, logger.get_dir())

    modelname = datetime.datetime.now().isoformat()

    #define callback function for the training process
    def render_callback(lcl, _glb):
        # print(lcl['episode_rewards'])
        total_steps = lcl['env'].total_steps
        #if total_steps % 2000 == 0:

        env.render()
        # pass


    # Different model variants were tried (commented out):
    # CNN built with deepq.models.cnn_to_mlp(params), trained with deepq.learn(params).

    # 2018-08-12-10:25:50  model 4, 100k, lr 0.0005, alpha 0.6, gamma 0.99, 8 frames, v1
    # 2018-08-12-11:31:59  model 4, 100k, lr 0.0005, alpha 0.8, gamma 0.99, 6 frames, v1

    # model 04
    # Nature DQN paper + improvements:
    # Dueling Double DQN, Prioritized Experience Replay, and fixed Q-targets

    model = deepq.models.cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2),
               (64, 3, 1)],  # (num_outputs, kernel_size, stride)
        hiddens=[512],  # 512
        dueling=bool(1),
    )

    act = deepq.learn(
        env,
        q_func=model,
        lr=0.0001,  # 0.00025 1e-4
        max_timesteps=int(100000),  # 100k -> 3h
        buffer_size=50000,  # 5000, #10000
        exploration_fraction=0.3,  # 0.1,
        exploration_final_eps=0.1,  # 0.01
        train_freq=4,  # 4
        learning_starts=25000,  # 10000
        target_network_update_freq=1000,
        gamma=0.5,  #0.99,
        prioritized_replay=bool(1),
        prioritized_replay_alpha=0.2,
        checkpoint_freq=args.checkpoint_freq,
        #        checkpoint_path=args.checkpoint_path,
        callback=render_callback,
        print_freq=1)

    print("Saving model to mario_model.pkl " + timestart)
    act.save("../models/mario_model_{}.pkl".format(timestart))

    env.close()