Code Example #1
def Environment():
    env = gym_super_mario_bros.make(ENV_NAME)
    env = JoypadSpace(env, COMPLEX_MOVEMENT)
    env = Reward(env)
    env = SkipFrame(env)
    return env
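Reward and SkipFrame here are project-specific wrappers that the snippet does not define. As a point of reference only, a minimal reward-shaping wrapper could look like the sketch below, assuming the standard gym.RewardWrapper API; the scaling and clipping values are illustrative, not the project's actual logic.

import gym

class Reward(gym.RewardWrapper):
    """Hypothetical sketch of a reward-shaping wrapper."""
    def reward(self, reward):
        # Illustrative shaping only: scale the raw NES reward and clip it.
        return max(min(reward / 15.0, 1.0), -1.0)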
Code Example #2
import gym_super_mario_bros
from random import random, randrange
from gym_super_mario_bros.actions import RIGHT_ONLY
from nes_py.wrappers import JoypadSpace
from gym import wrappers

env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = JoypadSpace(env, RIGHT_ONLY)
env = wrappers.Monitor(env, 'demo', force=True)

# Play randomly
done = False
env.reset()

step = 0
while not done:
    action = randrange(len(RIGHT_ONLY))
    state, reward, done, info = env.step(action)
    if step > 400:
        break
    print(done, step, info)
    env.render()
    step += 1

env.close()
Code Example #3
File: Test_vtrace.py  Project: hybug/test_ppo
class Env(object):
    def __init__(self, act_space, act_repeats, frames, game):
        self.act_space = act_space
        self.act_repeats = act_repeats
        self.act_repeat = random.choice(self.act_repeats)
        self.frames = frames

        self.max_pos = -10000

        self.count = 0

        env = gym_super_mario_bros.make(game)
        if self.act_space == 7:
            self.env = JoypadSpace(env, SIMPLE_MOVEMENT)
        elif self.act_space == 12:
            self.env = JoypadSpace(env, COMPLEX_MOVEMENT)

        s_t = self.resize_image(self.env.reset())

        self.s_t = np.tile(s_t, [1, 1, frames])
        self.s = [self.s_t]

        self.a_t = random.randint(0, act_space - 1)
        self.a = [self.a_t]
        self.a_logits = []
        self.r = []
        self.pos = []

        self.v_cur = []

        c_in = np.zeros(256, dtype=np.float32)
        h_in = np.zeros(256, dtype=np.float32)
        state_in = np.concatenate([c_in, h_in], axis=-1)
        self.state_in = [state_in]

        self.done = False

    def step(self, a, a_logits, state_in):
        self.count += 1
        if self.count % self.act_repeat == 0:
            self.a_t = a
            self.count = 0
            self.act_repeat = random.choice(self.act_repeats)
        gs_t1, gr_t, gdone, ginfo = self.env.step(self.a_t)
        self.env.render()
        if not gdone:
            s_t1, r_t, done, info = self.env.step(self.a_t)
            r_t += gr_t
            r_t /= 2.
        else:
            s_t1 = gs_t1
            r_t = gr_t
            done = gdone
            info = ginfo
        r_t /= 15.
        s_t1 = self.resize_image(s_t1)
        channels = s_t1.shape[-1]
        self.s_t = np.concatenate([s_t1, self.s_t[:, :, :-channels]], axis=-1)

        self.s.append(self.s_t)
        self.a.append(self.a_t)
        self.a_logits.append(a_logits)
        self.r.append(r_t)
        self.max_pos = max(self.max_pos, info["x_pos"])
        self.pos.append(info["x_pos"])
        if (len(self.pos) > 500) and (
                info["x_pos"] - self.pos[-500] < 5) and (
                self.pos[-500] - info["x_pos"] < 5):
            done = True
        self.done = done

        self.state_in.append(state_in)

    def update_v(self, v_cur):
        self.v_cur.append(v_cur)

    def reset(self, force=False):
        if self.done or force:
            self.count = 0
            self.act_repeat = random.choice(self.act_repeats)

            s_t = self.resize_image(self.env.reset())

            self.s_t = np.tile(s_t, [1, 1, self.frames])
            self.s = [self.s_t]

            self.a_t = random.randint(0, self.act_space - 1)
            self.a = [self.a_t]
            self.a_logits = []
            self.r = []
            self.pos = []

            self.v_cur = []

            c_in = np.zeros(256, dtype=np.float32)
            h_in = np.zeros(256, dtype=np.float32)
            state_in = np.concatenate([c_in, h_in], axis=-1)
            self.state_in = [state_in]

            self.done = False

    def get_state(self):
        return self.s_t

    def get_act(self):
        return self.a_t

    def get_max_pos(self):
        return self.max_pos

    def reset_max_pos(self):
        self.max_pos = -10000

    def get_state_in(self):
        return self.state_in[-1]

    def get_history(self, force=False):
        if self.done or force:
            if self.done:
                seg = Seg(self.s, self.a, self.a_logits, self.r, self.v_cur, self.state_in)
                return seg
            if force and len(self.r) > 1:
                seg = Seg(self.s[:-1], self.a[:-1], self.a_logits[:-1], self.r[:-1],
                          self.v_cur[:-1], self.state_in[:-1])
                return seg
        return None

    @staticmethod
    def resize_image(image, size=84):
        image = Image.fromarray(image)
        image = image.convert("L")
        image = image.resize((size, size))
        image = np.array(image)
        image = image / 255.
        image = np.array(image, np.float32)
        return image[:, :, None]
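Seg and the module-level imports (random, numpy as np, PIL.Image, the gym_super_mario_bros helpers) live elsewhere in Test_vtrace.py and are not shown above. Under the assumption that Seg is a plain record of the six lists passed to it, a hypothetical definition plus a minimal driver loop for this Env class might look like this:

import random
import numpy as np
from collections import namedtuple

# Assumed structure of the segment record returned by get_history().
Seg = namedtuple("Seg", ["s", "a", "a_logits", "r", "v_cur", "state_in"])

# Hypothetical usage: step with random actions and collect finished segments.
env = Env(act_space=7, act_repeats=[1, 2, 4], frames=4, game="SuperMarioBros-v0")
segs = []
for _ in range(1000):
    a = random.randint(0, 6)
    env.step(a, a_logits=np.zeros(7, dtype=np.float32), state_in=env.get_state_in())
    env.update_v(0.0)
    seg = env.get_history()
    if seg is not None:
        segs.append(seg)
    env.reset()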
Code Example #4
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from numba import cuda  #importing environments
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = JoypadSpace(env, SIMPLE_MOVEMENT)  #importing libraries
import numpy as np
observation = env.reset()
r = []
infos = []
MAX_STEPS = 1500
frames = np.zeros((MAX_STEPS, 240, 256, 3), dtype=np.int64)
xs = []
valid_actions = [1, 5, 6]
for step in range(MAX_STEPS):
    # Render into buffer.
    frames[step] = env.render(mode='rgb_array')
    observation, reward, done, info = env.step(
        valid_actions[np.random.randint(3)])  #
    infos.append(info)
    r.append(reward)
    if done:
        break

r = np.array(r)


def preprocess(frame):
    # Convert RGB to grayscale in [0, 1] (255 * 3 = 765), crop the score bar, and downsample by 2.
    frame = frame.sum(axis=-1) / 765
    frame = frame[20:210, :]
    frame = frame[::2, ::2]
    return frame
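The snippet ends with this helper; as a small usage note (an assumption, not part of the original), the frames collected in the loop above could then be preprocessed in one pass:

# Hypothetical follow-up: preprocess only the frames that were actually filled.
processed = np.stack([preprocess(f) for f in frames[:len(r)]])
print(processed.shape)  # (steps, 95, 128) given the crop and 2x downsampling above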
Code Example #5
File: Play.py  Project: Pxtri2156/Project_AI
import time
import numpy as np
#from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import RIGHT_ONLY
from agent import DQNAgent
from wrappers import wrapper
from utils import get_args

# Take argument
arg = get_args()

# Build env (first level, right only)
env = gym_super_mario_bros.make(arg.env)
env = JoypadSpace(env, RIGHT_ONLY)
env = wrapper(env)
# Parameters
states = (84, 84, 4)
actions = env.action_space.n

# Pham xuan
# Agent
agent = DQNAgent(states=states, actions=actions, max_memory=100000, double_q=True)

# Episodes
episodes = 101
rewards = []

# Timing
start = time.time()
Code Example #6
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
from nes_py.wrappers import JoypadSpace  # needed for JoypadSpace(env, my_action) below
import matplotlib.pyplot as plt
from matplotlib import animation, rc
import numpy as np
from rl.core import Processor

my_action = [
    ['right', 'A'],
    ['right', 'B'],
    ['right', 'A', 'B'],
]

env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')

env = JoypadSpace(env, my_action)

nb_actions = 3
window_length = 1
input_shape = (window_length, ) + env.observation_space.shape
print(input_shape)

# Reset the game environment
env.reset()

from keras.models import Sequential
from keras.layers import *
from keras.initializers import he_normal
model = Sequential()
print('input_shape' + str(input_shape))
model.add(Flatten(input_shape=input_shape))
Code Example #7
class Env(object):
    def __init__(self, game, **kwargs):
        self.act_space = kwargs.get("act_space")
        self.state_size = kwargs.get("state_size")
        self.burn_in = kwargs.get("burn_in")
        self.seqlen = kwargs.get("seqlen")
        self.n_step = kwargs.get("n_step")
        self.frames = kwargs.get("frames")

        self.game = game

        self.count = 0
        self.count_maxpos = []

        env = gym_super_mario_bros.make(game)
        if self.act_space == 7:
            self.env = JoypadSpace(env, SIMPLE_MOVEMENT)
        elif self.act_space == 12:
            self.env = JoypadSpace(env, COMPLEX_MOVEMENT)

        self.max_pos = -10000
        self.done = True
        self.reset()

    def step(self, a, a_logits, v_cur, state_in):
        maxpos = self.reset()

        self.count += 1
        self.a_t = a
        gs_t1, gr_t, gdone, ginfo = self.env.step(self.a_t)
        # self.env.render()
        if not gdone:
            s_t1, r_t, done, info = self.env.step(self.a_t)
            r_t += gr_t
            r_t /= 2.
        else:
            s_t1 = gs_t1
            r_t = gr_t
            done = gdone
            info = ginfo
        r_t /= 15.0
        s_t1 = self.resize_image(s_t1)
        channels = s_t1.shape[-1]
        self.s_t = np.concatenate([s_t1, self.s_t[:, :, :-channels]], axis=-1)
        # self.s.append(self.s_t)
        # self.a.append(self.a_t)
        # self.a_logits.append(a_logits)
        # self.r.append(r_t)
        # self.v_cur.append(v_cur)
        # self.state_in.append(state_in)
        self.s = [self.s_t]
        self.a = [self.a_t]
        self.a_logits = [a_logits]
        self.r = [r_t]
        self.v_cur = [v_cur]
        self.state_in = [state_in]
        self.max_pos = max(self.max_pos, info["x_pos"])
        self.pos.append(info["x_pos"])
        # if (len(self.pos) > 100) and (
        #         info["x_pos"] - self.pos[-100] < 5) and (
        #         self.pos[-100] - info["x_pos"] < 5):
        #     done = True
        self.done = done
        if self.done:
            self.mask.append(0)
        else:
            self.mask.append(1)

        """
        get segs
        """
        # segs = self.get_history()
        #
        # return segs
        return maxpos

    def reset(self):
        if self.done:
            self.count_maxpos.append(self.max_pos)
            print(self.game, self.max_pos, len(self.count_maxpos[1:]), np.mean(self.count_maxpos[1:]))

            self.count = 0

            s_t = self.resize_image(self.env.reset())

            self.s_t = np.tile(s_t, [1, 1, self.frames])
            self.s = [self.s_t]

            self.a_t = random.randint(0, self.act_space - 1)
            self.a = [self.a_t]
            self.a_logits = []
            self.r = [0]
            self.v_cur = []
            self.mask = [1]

            self.max_pos = -10000
            self.pos = []

            state_in = np.zeros(self.state_size, dtype=np.float32)
            self.state_in = [state_in]

            self.done = False
            return self.count_maxpos
        return None

    def get_state(self):
        return self.s_t

    def get_act(self):
        return self.a_t

    def get_reward(self):
        return self.r[-1]

    def get_max_pos(self):
        return self.max_pos

    def get_state_in(self):
        return self.state_in[-1]

    @staticmethod
    def resize_image(image, size=84):
        image = Image.fromarray(image)
        image = image.convert("L")
        image = image.resize((size, size))
        image = np.array(image, np.uint8)
        return image[:, :, None]
Code Example #8
def main():
    env = gym_super_mario_bros.make('SuperMarioBros-v0')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)

    agent = DQNAgent(action_size=7)

    scores, episodes, global_step = [], [], 0

    global_start = datetime.now()
    local_start = datetime.now()

    print()
    print("=" * 100)
    print("RL environment initialized")
    print("=" * 100)
    print()

    for e in range(4):
        e = e + 1
        done = False
        dead = False

        step, score, start_life = 0, 0, 5
        observe = env.reset()

        for _ in range(random.randint(1, agent.no_op_steps)):
            observe, _, _, _ = env.step(1)

        state = agent.pre_processing(observe)
        history = np.stack((state, state, state, state), axis=2)
        history = np.reshape([history], (1, 240, 256, 4))

        count_epsilon = 0
        count_greedy = 0

        while not done:
            # if agent.render:
            env.render()
            global_step += 1
            step += 1
            # Select an action based on the previous 4 states
            action, res = agent.get_action(history)
            if res:
                count_epsilon += 1
            else:
                count_greedy += 1

            # Advance the environment one timestep with the selected action
            observe, reward, done, info = env.step(action)
            # print(info)
            # Preprocess the state at each timestep
            next_state = agent.pre_processing(observe)
            next_state = np.reshape([next_state], (1, 240, 256, 1))
            next_history = np.append(next_state, history[:, :, :, :3], axis=3)
            agent.avg_q_max += np.amax(
                agent.model.predict(np.float32(history / 255.))[0])

            real_reward = reward

            score += real_reward

            if dead:
                dead = False
            else:
                history = next_history

            if global_step == 0:
                pass
            elif global_step % 1000 == 0:
                print("local step : {}, time : {} sec, epsilon : {}".format(
                    global_step, (datetime.now() - local_start).seconds,
                    agent.epsilon))
                local_start = datetime.now()

            if done:
                print(
                    "episode : {}, score : {}, epsilon : {}, step : {}, avg q : {}, avg loss : {}"
                    .format(e, score, agent.epsilon, global_step,
                            agent.avg_q_max / float(step),
                            agent.avg_loss / float(step)))
                print("epsilon : {}, greedy : {}".format(
                    count_epsilon, count_greedy))
                print()

                # if e < 2:
                #     pass
                # else:
                print("time elapsed : {} sec".format(
                    (datetime.now() - global_start).seconds))
                global_start = datetime.now()
                print()
                print()

                agent.avg_q_max, agent.avg_loss, global_step = 0, 0, 0
Code Example #9
    def load_model(self, name):
        self.actor.load_weights(name)


def pre_processing(next_observe, observe):
    processed_observe = np.maximum(next_observe, observe)
    processed_observe = np.uint8(
        resize(rgb2gray(processed_observe), (240, 256), mode='constant') * 255)
    return processed_observe


if __name__ == "__main__":
    # env = gym.make(env_name)
    env = gym_super_mario_bros.make('SuperMarioBros-v0')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)
    agent = TestAgent(action_size=7)
    agent.load_model("a3c_actor.h5")

    step = 0

    while episode < EPISODES:
        done = False
        dead = False

        score, start_life = 0, 5
        observe = env.reset()
        next_observe = observe

        # for _ in range(random.randint(1, 20)):
        #     observe = next_observe
Code Example #10
def evaluate_folder(root_dir, file_name_list, folder_index, param_list, alpha,
                    model_name_list, num_evals):
    # print(model_name_list)
    # print(os.path.join(root_dir, file_name_list[folder_index], param_list[folder_index]))

    json_file_name = os.path.join(root_dir, param_list[folder_index])
    json_file = open(json_file_name)
    json_str = json_file.read()
    hyperparam = json.loads(json_str)

    num_episodes = hyperparam['num_episodes']
    eval_cycle = hyperparam['eval_cycle']
    num_eval_episodes = hyperparam['num_eval_episodes']
    train_every_n_steps = hyperparam['train_every_n_steps']
    train_n_times = hyperparam['train_n_times']
    batch_size = hyperparam['batch_size']
    learning_rate = hyperparam['learning_rate']
    capacity = hyperparam['capacity']
    gamma = hyperparam['gamma']
    epsilon = hyperparam['epsilon']
    tau = hyperparam['tau']
    soft_update = hyperparam['soft_update']
    history_length = hyperparam['history_length']
    skip_frames = hyperparam['skip_frames']
    ddqn = hyperparam['ddqn']
    model = hyperparam['model']
    environment = hyperparam['environment']
    map = hyperparam['map']
    activation = hyperparam['activation']
    render_training = hyperparam['render_training']
    max_timesteps = hyperparam['max_timesteps']
    normalize_images = hyperparam['normalize_images']
    non_uniform_sampling = hyperparam['non_uniform_sampling']
    n_step_reward = hyperparam['n_step_reward']
    mu_intrinsic = hyperparam['mu_intrinsic']
    beta_intrinsic = hyperparam['beta_intrinsic']
    lambda_intrinsic = hyperparam['lambda_intrinsic']
    intrinsic = hyperparam['intrinsic']
    residual_icm_forward = hyperparam['residual_icm_forward']
    use_history_in_icm = hyperparam['use_history_in_icm']
    extrinsic = hyperparam['extrinsic']
    update_q_target = hyperparam['update_q_target']
    epsilon_schedule = hyperparam['epsilon_schedule']
    epsilon_start = hyperparam['epsilon_start']
    epsilon_end = hyperparam['epsilon_end']
    epsilon_decay = hyperparam['epsilon_decay']
    virtual_display = hyperparam['virtual_display']
    seed = hyperparam['seed']
    pre_intrinsic = hyperparam['pre_intrinsic']
    experience_replay = hyperparam['experience_replay']
    prio_er_alpha = hyperparam['prio_er_alpha']
    prio_er_beta_start = hyperparam['prio_er_beta_start']
    prio_er_beta_end = hyperparam['prio_er_beta_end']
    prio_er_beta_decay = hyperparam['prio_er_beta_decay']
    init_prio = hyperparam['init_prio']
    fixed_encoder = hyperparam['fixed_encoder']
    duelling = hyperparam['duelling']
    iqn = hyperparam['iqn']
    iqn_n = hyperparam['iqn_n']
    iqn_np = hyperparam['iqn_np']
    iqn_k = hyperparam['iqn_k']
    iqn_tau_embed_dim = hyperparam['iqn_tau_embed_dim']
    iqn_det_max_train = hyperparam['iqn_det_max_train']
    iqn_det_max_act = hyperparam['iqn_det_max_act']
    huber_kappa = hyperparam['huber_kappa']
    state_height = hyperparam['state_height']
    state_width = hyperparam['state_width']
    number_model_files = hyperparam['number_model_files']
    simple_coverage_threshold = hyperparam['simple_coverage_threshold']
    geometric_coverage_gamma = hyperparam['geometric_coverage_gamma']
    num_total_steps = hyperparam['num_total_steps']
    store_cycle = hyperparam['store_cycle']
    adam_epsilon = hyperparam['adam_epsilon']
    gradient_clip = hyperparam.get('gradient_clip', False)

    # Set seed
    torch.manual_seed(seed)
    # Create experiment directory with run configuration
    args_for_filename = [
        'environment', 'map', 'extrinsic', 'intrinsic', 'fixed_encoder',
        'ddqn', 'duelling', 'iqn', 'experience_replay', 'soft_update',
        'n_step_reward', 'seed'
    ]

    if environment == envs[0]:
        from vizdoom_env.vizdoom_env import DoomEnv
        env = DoomEnv(map_name=map, render=render_training)
        writer = setup_experiment_folder_writer(
            inspect.currentframe(),
            name='Vizdoom',
            log_dir='vizdoom_eval',
            args_for_filename=args_for_filename,
            additional_param=hyperparam)
        # placeholder for non uniform action probabilities. change to something sensible if wanted.
        nu_action_probs = np.ones(env.action_space.n,
                                  dtype=np.float32) / env.action_space.n
    else:
        if virtual_display:
            if render_training:
                print(
                    """On the tfpool computers this will probably not work together.
                    Better deactivate render_training when using the virtual display."""
                )
            from pyvirtualdisplay import Display
            display = Display(visible=0, size=(224, 240))
            display.start()
        if environment == envs[1]:
            from nes_py.wrappers import JoypadSpace
            import gym_super_mario_bros
            from gym_super_mario_bros.actions import COMPLEX_MOVEMENT
            # env = retro.make(game='SuperMarioBros-Nes')
            env = gym_super_mario_bros.make('SuperMarioBros-v0').unwrapped
            env = JoypadSpace(env, COMPLEX_MOVEMENT)
            writer = setup_experiment_folder_writer(
                inspect.currentframe(),
                name='Mario',
                log_dir='mario',
                args_for_filename=args_for_filename)
            nu_action_probs = np.ones(env.action_space.n,
                                      dtype=np.float32) / env.action_space.n
        elif environment == envs[2]:
            import gym_minigrid
            from src.train_gridworld import ClassicalGridworldWrapper
            grid_size = 100
            env = gym_minigrid.envs.EmptyEnv(size=grid_size)
            env = ClassicalGridworldWrapper(env)
            writer = setup_experiment_folder_writer(
                inspect.currentframe(),
                name='GridWorld',
                log_dir='gridworld',
                args_for_filename=args_for_filename)
            nu_action_probs = np.ones(env.action_space.n,
                                      dtype=np.float32) / env.action_space.n
        elif environment == envs[3]:
            import gym
            env = gym.make('Pong-v0')
            writer = setup_experiment_folder_writer(
                inspect.currentframe(),
                name='Pong',
                log_dir='pong',
                args_for_filename=args_for_filename)
            nu_action_probs = np.ones(env.action_space.n,
                                      dtype=np.float32) / env.action_space.n
        else:
            raise NotImplementedError()

    num_actions = env.action_space.n

    channels = 1  # greyscale images
    state_dim = (channels, state_height, state_width
                 )  # not taking history_length into account. handled later.

    # Define Q network, target network and DQN agent
    if model == 'Resnet':
        CNN = ResnetVariant
    elif model == 'Lenet':
        CNN = LeNetVariant
    elif model == 'DeepQNetwork':
        CNN = DeepQNetwork
    else:
        raise ValueError('{} not implemented'.format(model))

    activation = {
        'ReLU': torch.nn.ReLU,
        'ELU': torch.nn.ELU,
        'LeakyReLU': torch.nn.LeakyReLU
    }[activation]

    Q_net = CNN(in_dim=state_dim,
                num_actions=num_actions,
                history_length=history_length,
                duelling=duelling,
                iqn=iqn,
                activation=activation,
                embedding_dim=iqn_tau_embed_dim).to(device)
    Q_target_net = CNN(in_dim=state_dim,
                       num_actions=num_actions,
                       history_length=history_length,
                       duelling=duelling,
                       iqn=iqn,
                       activation=activation,
                       embedding_dim=iqn_tau_embed_dim).to(device)

    state_encoder = Encoder(in_dim=state_dim,
                            history_length=history_length,
                            use_history=use_history_in_icm).to(device)
    # push a dummy input through state_encoder to get output dimension which is needed to build dynamics models.
    tmp_inp = torch.zeros(size=(1, channels *
                                (history_length if use_history_in_icm else 1),
                                state_height, state_width))
    tmp_out = state_encoder(tmp_inp.to(device))
    inverse_dynamics_model = InverseModel(num_actions=num_actions,
                                          input_dim=2 *
                                          tmp_out.shape[1]).to(device)
    forward_dynamics_model = ForwardModel(
        num_actions=num_actions, state_dim=tmp_out.shape[1]).to(device)

    intrinsic_reward_network = IntrinsicRewardGenerator(
        state_encoder=state_encoder,
        inverse_dynamics_model=inverse_dynamics_model,
        forward_dynamics_model=forward_dynamics_model,
        num_actions=num_actions,
        fixed_encoder=fixed_encoder,
        residual_forward=residual_icm_forward,
        use_history=use_history_in_icm)

    agent = DQNAgent(Q=Q_net,
                     Q_target=Q_target_net,
                     intrinsic_reward_generator=intrinsic_reward_network,
                     num_actions=num_actions,
                     gamma=gamma,
                     batch_size=batch_size,
                     tau=tau,
                     epsilon=epsilon,
                     capacity=capacity,
                     train_every_n_steps=train_every_n_steps,
                     history_length=history_length,
                     soft_update=soft_update,
                     ddqn=ddqn,
                     n_step_reward=n_step_reward,
                     train_n_times=train_n_times,
                     non_uniform_sampling=non_uniform_sampling,
                     epsilon_schedule=epsilon_schedule,
                     mu=mu_intrinsic,
                     beta=beta_intrinsic,
                     update_q_target=update_q_target,
                     lambda_intrinsic=lambda_intrinsic,
                     intrinsic=intrinsic,
                     epsilon_start=epsilon_start,
                     epsilon_end=epsilon_end,
                     lr=learning_rate,
                     epsilon_decay=epsilon_decay,
                     extrinsic=extrinsic,
                     pre_intrinsic=pre_intrinsic,
                     experience_replay=experience_replay,
                     prio_er_alpha=prio_er_alpha,
                     huber_kappa=huber_kappa,
                     prio_er_beta_start=prio_er_beta_start,
                     prio_er_beta_end=prio_er_beta_end,
                     init_prio=init_prio,
                     prio_er_beta_decay=prio_er_beta_decay,
                     state_dim=state_dim,
                     iqn=iqn,
                     iqn_n=iqn_n,
                     iqn_np=iqn_np,
                     iqn_k=iqn_k,
                     iqn_det_max_train=iqn_det_max_train,
                     iqn_det_max_act=iqn_det_max_act,
                     nu_action_probs=nu_action_probs,
                     adam_epsilon=adam_epsilon,
                     gradient_clip=gradient_clip)

    eval_offline(env=env,
                 agent=agent,
                 writer=writer,
                 num_episodes=num_episodes,
                 eval_cycle=eval_cycle,
                 num_eval_episodes=num_eval_episodes,
                 soft_update=soft_update,
                 skip_frames=skip_frames,
                 history_length=history_length,
                 rendering=render_training,
                 max_timesteps=max_timesteps,
                 normalize_images=normalize_images,
                 state_dim=state_dim,
                 init_prio=init_prio,
                 num_model_files=number_model_files,
                 simple_coverage_threshold=simple_coverage_threshold,
                 geometric_coverage_gamma=geometric_coverage_gamma,
                 num_total_steps=num_total_steps,
                 store_cycle=store_cycle,
                 model_name_list=model_name_list[folder_index],
                 alpha=alpha,
                 num_evals=num_evals,
                 path_of_run=root_dir)
    writer.close()
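evaluate_folder reads every hyperparameter from a JSON file located via param_list. As a purely illustrative aid (the values and the key subset below are assumptions, not taken from the project), the file corresponds to a flat dictionary along these lines:

# Hypothetical excerpt of the JSON config, written as a Python dict; illustrative values only.
hyperparam_example = {
    "num_episodes": 1000,
    "eval_cycle": 20,
    "num_eval_episodes": 5,
    "batch_size": 64,
    "learning_rate": 1e-4,
    "gamma": 0.99,
    "history_length": 4,
    "seed": 0,
    # ...plus every other key unpacked at the top of evaluate_folder above
}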
Code Example #11
import numpy as np
import time
import random
from tqdm import tqdm
import cv2
import matplotlib.pyplot as plt
from nes_py.wrappers import JoypadSpace
import gym_tetris
from gym_tetris.actions import MOVEMENT

env = gym_tetris.make('TetrisA-v3')
env = JoypadSpace(env, MOVEMENT)
num_bins = 12
MOVELEFT = 6
MOVERIGHT = 3
MOVEDOWN = 9
RENDER = True


P = {
        0: { # I
            0: [(0,0), (1,0), (2,0), (3,0)],
            90: [(0,0), (0,1), (0,2), (0,3)],
            180: [(3,0), (2,0), (1,0), (0,0)],
            270: [(1,3), (1,2), (1,1), (1,0)],
        },
        1: { # T
            0: [(1,0), (0,1), (1,1), (2,1)],
            90: [(0,1), (1,2), (1,1), (1,0)],
            180: [(1,1), (2,0), (1,0), (0,0)],
            270: [(1,1), (0,0), (0,1), (0,2)],
Code Example #12
                        action='store_true',
                        help='Store the model')
    parser.add_argument("--save_freq",
                        default=5e4,
                        type=int,
                        help="How often the models' weights are saved")
    args = parser.parse_args()

    # Create a store path for results and debug_summaries
    save_time = dt.datetime.now().strftime("%Y%m%d-%H%M%S")

    reward_writer = tf.summary.create_file_writer('./logs/' + save_time)

    # Initialise the environment
    env = gym_super_mario_bros.make(args.env)
    env = JoypadSpace(env, RIGHT_ONLY)
    env = wrapper(env)

    num_actions = env.action_space.n
    observation_space = args.frame_size
    num_frames = 4

    # Initialise the agent
    kwargs = {
        "observation_space": observation_space,
        "num_actions": num_actions,
        "num_frames": num_frames,
        "delay_timesteps": args.delay_timesteps,
        "beta_decay_iter": args.beta_decay,
        "min_epsilon": args.min_epsilon,
        "epsilon_decay_iter": args.epsilon_decay,
Code Example #13
showEnviornment = False

episodeNum = 0

# if gpu is to be used
use_cuda = torch.cuda.is_available()

device = torch.device("cuda:0" if use_cuda else "cpu")
print(device)

Tensor = torch.Tensor
LongTensor = torch.LongTensor

env = gym_super_mario_bros.make('SuperMarioBros-v0')
#env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
env = JoypadSpace(env, SIMPLE_MOVEMENT)

#directory = './MarioVideos/'
directory = './MarioVideosLong/'
env = gym.wrappers.Monitor(
    env, directory, video_callable=lambda episode_id: episode_id % 5000 == 0)

seed_value = 23
env.seed(seed_value)
torch.manual_seed(seed_value)
random.seed(seed_value)

###### PARAMS ######
learning_rate = 0.0001
#num_episodes = 5000
num_episodes = 9999999999
Code Example #14
def run_agent(agent, rendering=False, monitoring=False, print_reward=False):

    env = gym_super_mario_bros.make("SuperMarioBros-v0")
    env = JoypadSpace(env, SIMPLE_MOVEMENT)
    env.seed(42)

    if monitoring:
        env = Monitor(env, './video', force=True)
    agent.eval()

    state = env.reset()
    if rendering:
        env.render()

    #Conv2d without flatten()
    state = convert_image(state)  #.flatten()
    state_list = [state, state, state, state]
    position = -1
    stuck = 0  # counts consecutive steps without forward progress

    global_reward = 0
    s = 0
    for _ in range(10000):
        #Conv2d input
        input = torch.from_numpy(np.array(state_list)).type('torch.FloatTensor')\
            .unsqueeze(0)

        #Linear input
        #input = torch.tensor(state_list).type("torch.FloatTensor").view(1,-1)

        output_probabilities = agent(input).detach().numpy()[0]
        action = np.random.choice(range(action_count), 1, \
            p=output_probabilities).item()
        new_state, reward, done, info = env.step(action)
        global_reward += reward

        s = s + 1
        if rendering:
            env.render()

        state_list.pop()
        #Conv2d without flatten()
        state_list.append(convert_image(new_state))  #.flatten())

        # if mario gets stuck, it gets punished and the loop gets broken
        if position == info["x_pos"]:
            stuck += 1
            if stuck == 100:
                global_reward -= 100
                break
        else:
            stuck = 0

        position = info["x_pos"]
        #env.render()
        #Mario died
        if info["life"] < 2:
            break
    if print_reward:
        print(global_reward)

    return global_reward
Code Example #15
argument_parser.add_argument("-b", "--batch-size", type=int, default=64)
argument_parser.add_argument("-l", "--length", type=int, default=None)
argument_parser.add_argument("--enable-cuda", action="store_true")
args = argument_parser.parse_args()

if args.enable_cuda:
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
        warnings.warn("cuda is not available. Defaulting to cpu")
else:
    device = "cpu"

env = gym_super_mario_bros.make("SuperMarioBros-v0")
env = JoypadSpace(env, SIMPLE_MOVEMENT)
env = MarioEnv(env)
agent = DQN("cnn", env, replay_size=100000, epsilon_decay=100000)
trainer = AdversariaTrainer(
    agent=agent,
    env=env,
    dataset=args.input_path,
    possible_actions=SIMPLE_MOVEMENT,
    device=device,
    length=args.length,
    off_policy=True,
    evaluate_episodes=1,
)
trainer.train(epochs=args.epochs, lr=args.lr, batch_size=args.batch_size)
trainer.evaluate(render=True)
Code Example #16
from nes_py.wrappers import JoypadSpace
import gym
from Contra.actions import SIMPLE_MOVEMENT, COMPLEX_MOVEMENT, RIGHT_ONLY

env = gym.make('Contra-v0')
env = JoypadSpace(env, RIGHT_ONLY)

print("actions", env.action_space)
print("observation_space ", env.observation_space.shape[0])

done = False
a = env.reset()
print("a ", a)
for step in range(5000):
    if done:
        print("Over")
        break
    state, reward, done, info = env.step(env.action_space.sample())
    # print("state ", state)
    # print("reward ", reward)
    # print("Done ", done)
    print("score ", info['score'])
    env.render()

env.close()
Code Example #17
File: main.py  Project: sungwah/ME336-Blue-Team-Pro1
import gym
import gym_super_mario_bros
from gym.wrappers import FrameStack, GrayScaleObservation, TransformObservation
from nes_py.wrappers import JoypadSpace

from metrics import MetricLogger
from agent import Mario
from wrappers import ResizeObservation, SkipFrame

# Initialize Super Mario environment
env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')

# Limit the action-space to
#   0. walk right
#   1. jump right
env = JoypadSpace(env, [['right'], ['right', 'A']])

# Apply Wrappers to environment
env = SkipFrame(env, skip=4)
env = GrayScaleObservation(env, keep_dim=False)
env = ResizeObservation(env, shape=84)
env = TransformObservation(env, f=lambda x: x / 255.)
env = FrameStack(env, num_stack=4)

env.reset()

save_dir = Path('checkpoints') / datetime.datetime.now().strftime(
    '%Y-%m-%dT%H-%M-%S')
save_dir.mkdir(parents=True)

checkpoint = None  # Path('checkpoints/2020-10-21T18-25-27/mario.chkpt')
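ResizeObservation and SkipFrame are imported from the project's local wrappers module and are not shown here (a SkipFrame implementation of the same flavor appears in Code Example #28 below). A minimal sketch of what a resize wrapper of this kind typically looks like, assuming gym.ObservationWrapper and OpenCV rather than the project's exact implementation:

import cv2
import numpy as np
import gym
from gym.spaces import Box

class ResizeObservation(gym.ObservationWrapper):
    """Hypothetical sketch: resize each observation to (shape, shape)."""
    def __init__(self, env, shape):
        super().__init__(env)
        self.shape = (shape, shape)
        obs_shape = self.shape + self.observation_space.shape[2:]
        self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)

    def observation(self, observation):
        # cv2.resize expects (width, height); both are equal here.
        return cv2.resize(observation, self.shape, interpolation=cv2.INTER_AREA)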
Code Example #18
LEARNING_STARTS = 50000
LEARNING_FREQ = 4
TARGER_UPDATE_FREQ = 10000
LEARNING_RATE = 0.00075
ALPHA = 0.95
ALPHA_P = 0.6
EPS = 0.01

env = gym_super_mario_bros.make('SuperMarioBros-1-1-v1')
env.seed(SEED)
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

env = wrap_deepmind(env)
env = JoypadSpace(env, COMPLEX_MOVEMENT)
expt_dir = 'Game_video'
env = wrappers.Monitor(env, expt_dir, force=True, video_callable=False)

optimizer_spec = OptimizerSpec(
    constructor=optim.RMSprop,
    kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
)

exploration_schedule = LinearSchedule(2000000, 0.05, 0.05)
annelation_schedule = LinearSchedule(2000000, 1.0, 0.4)

dqn_learn(
    env=env,
    q_func=DQN,
    optimizer_spec=optimizer_spec,
Code Example #19
File: demo.py  Project: audreyakwenye/TetrisProject
from nes_py.wrappers import JoypadSpace
import gym_tetris
from gym_tetris.actions import MOVEMENT

env = gym_tetris.make('TetrisA-v0')
env = JoypadSpace(env, MOVEMENT)

done = True
for step in range(5000):
    if done:
        state = env.reset()
    state, reward, done, info = env.step(env.action_space.sample())
    env.render()

env.close()
Code Example #20
#         figsize = (15., 5. * len(keys))
#     f, axarr = plt.subplots(len(keys), sharex=True, figsize=figsize)
#     for idx, key in enumerate(keys):
#         axarr[idx].plot(episodes, data[key])
#         axarr[idx].set_ylabel(key)
#     plt.xlabel('episodes')
#     plt.tight_layout()
#     if output is None:
#         plt.show()
#     else:
#         plt.savefig(output)

ENV_NAME = 'CustomContra-v2'
# Get the environment and extract the number of actions.
env = gym.make(ENV_NAME)
env = JoypadSpace(env, CUSTOM_MOVEMENT)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n
print(nb_actions)
print(env.observation_space.shape)
obs_dim = env.observation_space.shape[0]

# Next, we build a very simple model.
model = Sequential()
model.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
Code Example #21
                                             len(MOVEMENT)])),
                                  axis=0)
                    highest_objective = info['objective']
            else:
                state, reward, done, info = env.step(0)
            # env.render()


def get_discrete_state(state):
    x_i = state[0] // 16
    y_i = (state[1] - 61) // 16
    return tuple((x_i, y_i))


env = gym_zelda_1.make('Zelda1-v0')
env = JoypadSpace(env, MOVEMENT)

# Create models folder
if not os.path.isdir('Q_tables'):
    os.makedirs('Q_tables')

# The area where Link can be is approximately 255*175 pixels (x:0-255, y:64-239).
# If we divide these dimensions by 16, we get a (16, 11) matrix. This matrix will represent each discrete position Link can be in,
# and for each of these discrete positions, he can perform len(MOVEMENT) distinct actions. Therefore, the Q matrix will have the dimensions [16,11,len(MOVEMENT)].
Q = np.random.uniform(low=-15, high=15, size=([1, 16, 11, len(MOVEMENT)]))

start_in_level_1 = 0
state = env.reset()

LEARNING_RATE = 0.1
DISCOUNT = 0.95
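Given the discrete (x, y) grid described in the comments above, a hedged sketch of a single Q-table update for this setup follows; it assumes Link's position is read from info['x_pos'] and info['y_pos'] and uses an illustrative epsilon, neither of which is taken from the project's actual training loop.

# Hypothetical single epsilon-greedy Q-learning step using the table defined above.
epsilon = 0.1  # illustrative exploration rate
state, reward, done, info = env.step(env.action_space.sample())
discrete_state = get_discrete_state((info['x_pos'], info['y_pos']))
if np.random.random() > epsilon:
    action = int(np.argmax(Q[0][discrete_state]))
else:
    action = np.random.randint(0, len(MOVEMENT))
state, reward, done, info = env.step(action)
new_discrete_state = get_discrete_state((info['x_pos'], info['y_pos']))
max_future_q = np.max(Q[0][new_discrete_state])
current_q = Q[0][discrete_state + (action,)]
Q[0][discrete_state + (action,)] = (1 - LEARNING_RATE) * current_q \
    + LEARNING_RATE * (reward + DISCOUNT * max_future_q)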
Code Example #22
def test_env(env, model, device, deterministic=True):
    env = gym_super_mario_bros.make('SuperMarioBros-v0')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)
    env = RewardScalar(env)
    env = WarpFrame(env)
    env = FrameStack(env, 4)
    env = StochasticFrameSkip(env, 4, 0.5)
    env = ScaledFloatFrame(env)
    # env=gym.wrappers.Monitor(env, 'recording/PPORB5/{}'.format(str(num)), video_callable=lambda episode_id: True, force=True)
    state = env.reset()
    done = False
    total_reward = 0
    distance = []
    print("yes")
    for i in range(2000):
        state = torch.FloatTensor(state).to(device)
        state = state.float()
        state = state.permute(3, 0, 1, 2)
        dist, _ = model(state)
        policy = dist
        policy = Categorical(F.softmax(policy, dim=-1).data.cpu())
        actionLog = policy.sample()
        action = actionLog.numpy()
        next_state, reward, done, info = env.step(action[0])
        distance.append(info['x_pos'])
        state = next_state
        total_reward += reward
        env.render()

    print(total_reward)
    print(max(distance))
Code Example #23
class Game:
    def __init__(self, game_id, obs_size, skip_frame=4, mode='train'):
        self.game_id = game_id
        env = gym_super_mario_bros.make(game_id)
        temp_obs = env.reset()
        height, width, _ = temp_obs.shape
        self.env = JoypadSpace(env, COMPLEX_MOVEMENT)

        self.obs_last2max = np.zeros((2, obs_size, obs_size, 1), np.uint8)

        self.obstack = np.zeros((obs_size, obs_size, 4))
        self.rewards = []
        self.lives = 2
        self.skip = skip_frame
        self.mode = mode
        if self.mode == 'play':
            self.monitor = Monitor(width=width, height=height)

    def step(self, action, monitor=False):
        print(self.lives)
        reward = 0.0
        done = False

        for i in range(self.skip):
            obs, r, done, info = self.env.step(action)

            if self.mode == 'play':
                print('Take Action: \t', COMPLEX_MOVEMENT[action])
                self.monitor.record(obs)

            if i >= 2:
                self.obs_last2max[i % 2] = self._process_obs(obs)

            # Super Mario's reward is clipped to [-15.0, 15.0]
            reward += r / 15.0
            lives = info['life']

            if lives < self.lives:
                print(lives, self.lives)
                done = True
            print(done)
            self.lives = lives

            if done:
                break

        self.rewards.append(reward)

        if done:
            episode_info = {
                "reward": sum(self.rewards),
                "length": len(self.rewards)
            }
            self.reset()
        else:
            episode_info = None

            obs = self.obs_last2max.max(axis=0)

            self.obstack = np.roll(self.obstack, shift=-1, axis=-1)
            self.obstack[..., -1:] = obs

        return self.obstack, reward, done, episode_info

    def reset(self):
        obs = self.env.reset()

        obs = self._process_obs(obs)
        self.obstack[..., 0:] = obs
        self.obstack[..., 1:] = obs
        self.obstack[..., 2:] = obs
        self.obstack[..., 3:] = obs
        self.rewards = []

        self.lives = 2

        return self.obstack

    @staticmethod
    def _process_obs(obs):

        obs = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY)
        obs = cv2.resize(obs, (84, 84), interpolation=cv2.INTER_AREA)
        return obs[:, :, None]
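A hypothetical driver for the Game class above (the loop and the random action choice are assumptions, not part of the snippet; mode='train' avoids the Monitor dependency that is only used in 'play' mode):

game = Game('SuperMarioBros-v0', obs_size=84, skip_frame=4, mode='train')
obs = game.reset()
for _ in range(100):
    action = np.random.randint(len(COMPLEX_MOVEMENT))
    obs, reward, done, episode_info = game.step(action)
    if episode_info is not None:
        print(episode_info)  # {'reward': ..., 'length': ...} at the end of a life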
Code Example #24
from ann_utils.agents.curiosity_driven_agent import Curiosity_AC_GPT_Agent
from ann_utils.models.specialists.vae_transformer_model import GPT2_Curiosity_AC

# auxiliares
from ann_utils.manager import tf_global_initializer, tf_load, tf_save
from ann_utils.sess import TfSess

from tqdm import tqdm

config = tf.ConfigProto(log_device_placement=False)
config.gpu_options.allow_growth = True
config.gpu_options.per_process_gpu_memory_fraction = 0.9
sess = TfSess("mario_ac_transformer", gpu=True, config=config)

env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = JoypadSpace(env, COMPLEX_MOVEMENT)

state_info = env.reset()
action_info = env.action_space.sample()
action_size = env.action_space.n
chp = "./saved/mario/"

print('states len {}'.format(state_info.shape))
print('actions len {}'.format(action_size))

size = [192, 192, 3]
bs = 8
state_size = 256
sequence_size = 6

model = GPT2_Curiosity_AC(action_size, 256, 128, 3, 4)
Code Example #25
]

global_agent = Curiosity_AC_Context_Agent(model, 1000, bs,
                                          100000).build_agent_brain(
                                              i_s, s_s, act_s, dvc[0], dvc[2],
                                              sess, True, False, True,
                                              'mario_global', False, None)

workers = [
    Curiosity_AC_Context_Agent(model, 1000, bs, 100000).build_agent_brain(
        i_s, s_s, act_s, dvc[0], dvc[2], sess, False, False, False,
        'mario_local_{}'.format(w), True, global_agent.model.variables)
    for w in range(num_worker)
]

genv = JoypadSpace(gym_super_mario_bros.make('SuperMarioBros-v0'),
                   SIMPLE_MOVEMENT)
envs = [
    JoypadSpace(gym_super_mario_bros.make('SuperMarioBrosRandomStages-v0'),
                SIMPLE_MOVEMENT) for x in range(num_worker)
]

agents_controller = A3C(genv,
                        envs,
                        global_agent,
                        workers,
                        global_writer,
                        num_local_steps=bs)

tf_global_initializer(sess)

global_writer.add_graph(sess.get_session().graph)
Code Example #26
File: Train.py  Project: zachoines/ACIOTResearch
    def __init__(self, config):

        self._config = config

        def get_available_gpus():
            local_device_protos = device_lib.list_local_devices()
            return [
                x.name for x in local_device_protos if x.device_type == 'GPU'
            ]

        print("GPU Available: ", tf.test.is_gpu_available())

        # GPU configuration
        gpus = tf.config.experimental.list_physical_devices('GPU')
        tf.config.threading.set_inter_op_parallelism_threads(0)
        tf.config.threading.set_intra_op_parallelism_threads(0)

        if gpus:

            try:

                # Currently, memory growth needs to be the same across GPUs
                for gpu in gpus:
                    tf.config.experimental.set_memory_growth(gpu, True)

            except RuntimeError as e:
                print(e)

        env_names = []
        for _ in range(self._config['Number of worker threads']):
            env_names.append(self._config['Environment Name'])

        # Configuration
        # current_dir = os.getcwd()
        self._model_save_path = '.\Model'
        self._video_save_path = '.\Videos'
        self.record = True

        # Make the super mario gym environments and apply wrappers
        self._envs = []
        collector = Collector()
        collector.set_dimensions(
            ["CMA", "EMA", "SMA", "LENGTH", "LOSS", 'TOTAL_EPISODE_REWARDS'])
        self._plot = AsynchronousPlot(collector, live=False)

        # Apply env wrappers
        counter = 0
        for env_name in env_names:
            env = gym.make(env_name)

            if env_name == 'SuperMarioBros-v0':
                env = JoypadSpace(env, COMPLEX_MOVEMENT)

            # Load wrapper class
            env = Stats(env, collector)
            if self._config['Wrapper class'] != '':
                env = env_wrapper_import(self._config['Wrapper class'], env)

            env = Monitor(env,
                          env.observation_space.shape,
                          savePath=self._video_save_path,
                          record=self.record)

            env = preprocess.GrayScaleImage(
                env, height=84, width=84, grayscale=self._config['Grayscale'])
            env = preprocess.FrameStack(env, 4)

            self._envs.append(env)

        self.NUM_STATE = self._envs[0].observation_space.shape
        self.NUM_ACTIONS = self._envs[0].env.action_space.n
        self.ACTION_SPACE = self._envs[0].env.action_space

        if not os.path.exists(self._video_save_path):
            os.makedirs(self._video_save_path)

        if not os.path.exists(self._model_save_path):
            os.makedirs(self._model_save_path)

        if not os.path.exists('.\stats'):
            os.makedirs('.\stats')
Code Example #27
    def __len__(self):
        return len(self._force())

    def __getitem__(self, i):
        return self._force()[i]

    def count(self):
        frames = self._force()
        return frames.shape[frames.ndim - 1]

    def frame(self, i):
        return self._force()[..., i]


env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
env = JoypadSpace(env, COMPLEX_MOVEMENT)

env = EpisodicLifeEnv(env)
env = RewardScaler(env)
env = PreprocessFrame(env)
env = StochasticFrameSkip(env, 4, 0.5)
env = ScaledFloatFrame(env)
env = FrameStack(env, 4)


def get_action(state, actions_type="deterministic"):

    if actions_type == "Stochastic":
        action_probability_distribution = (training_model.predict(state))[0]
        top_actions = action_probability_distribution.argsort()[-2:][::-1]
        action = random.choice(top_actions)
Code Example #28
import numpy as np
from pathlib import Path
from collections import deque
import random, os, copy, datetime

import gym
from gym.spaces import Box
from gym.wrappers import FrameStack

from nes_py.wrappers import JoypadSpace

import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT

env = gym_super_mario_bros.make("SuperMarioBros-1-1-v0")
env = JoypadSpace(env, [["right"], ["right", "A"]])

env.reset()
next_state, reward, done, info = env.step(action=0)
print(f"{next_state.shape}, \n {reward }, \n {done}, \n {info}")

class SkipFrame(gym.Wrapper):
    def __init__(self, env, skip):
        super().__init__(env)
        self._skip = skip

    def step(self, action):
        total_reward = 0
        done = False
        for i in range(self._skip):
            obs, reward, done, info = self.env.step(action)
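            # (The snippet is truncated here; a plausible completion, assuming the
            #  usual frame-skip pattern of accumulating reward and stopping early
            #  when the episode ends.)
            total_reward += reward
            if done:
                break
        return obs, total_reward, done, info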
Code Example #29
from model import generate_complex_model
import tensorflow as tf
import os
import numpy as np

import time
import random
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT

from auto_everything.base import IO
io = IO()

env = gym_super_mario_bros.make('SuperMarioBros-v2')
env = JoypadSpace(env,  SIMPLE_MOVEMENT)
env.reset()

model_file_path = './nn_model.HDF5'
final_model_file_path = './final_nn_model.HDF5'
if os.path.exists(model_file_path):
    model = tf.keras.models.load_model(model_file_path)
else:
    model = generate_complex_model()


def train_once(last_state, history_actions, history_x_pos, history_y_pos, action, reward):
    global model
    model.train_on_batch(
        x={
            'action': np.expand_dims(action, axis=0),
Code Example #30
# To create a Super Mario environment, you must import gym_super_mario_bros.
# The gym_super_mario_bros environment uses all 256 NES action-space actions.
import gym_super_mario_bros
from gym_super_mario_bros.smb_env import SuperMarioBrosEnv

# To restrict the NES actions, use gym_super_mario_bros.actions.
# The actions module provides the following three action lists:
# RIGHT_ONLY, SIMPLE_MOVEMENT, COMPLEX_MOVEMENT
# These are used as wrappers via nes_py.wrappers.JoypadSpace.
# Here we chose to use SIMPLE_MOVEMENT.

# Create the super_mario_bros environment
env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
# Redefine the action_space (nes-py)
# env = JoypadSpace(env, SIMPLE_MOVEMENT)
env = JoypadSpace(env, COMPLEX_MOVEMENT)
# Redefine the observation_space (gym)
env = PreprocessFrame(env)
# Redefine the life cycle
# env = EpisodicLifeEnv(env)
# Reward redefinition still needed


episode = 3000
INITIAL_BUFFER_SIZE = 1000  # do not start training until the buffer holds this many initial samples
# values for maintaining a minimum amount of exploration
EPS = 1.00  # probability of acting randomly
EPS_THRESHOLD = 0.01  # keep at least a 1% random-action probability, no matter how long training runs
EPS_DECAY = 0.99  # multiply EPS by 0.99 each decay step

# keep track of progress
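The constants above describe an epsilon-greedy schedule: exploration starts at EPS = 1.0, is multiplied by EPS_DECAY after each episode, and is floored at EPS_THRESHOLD. A minimal, self-contained illustration of the schedule alone (no agent involved; this is not the project's training loop):

eps = EPS
for ep in range(episode):
    eps = max(EPS_THRESHOLD, eps * EPS_DECAY)
    if eps == EPS_THRESHOLD:
        print('epsilon reaches its floor after episode', ep)
        break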