Example 1
class Environment:

    actionMap = {
        0: 'NOOP',
        1: 'Right',
        2: 'Right-Jump',
        3: 'Right-Sprint',
        4: 'Right-Jump-Sprint',
        5: 'Jump',
        6: 'Left'
    }

    def __init__(self, rows=19, columns=16, verbose=True, raw=True, variant=1):
        self.verbose = verbose
        self.raw = raw
        self.variant = variant
        self.img2state = Img2State(rows=rows, columns=columns)
        self.game = BinarySpaceToDiscreteSpaceEnv(
            gym_super_mario_bros.make('SuperMarioBros-v3'), SIMPLE_MOVEMENT)
        self.state = self.img2state.transfrom(self.game.reset(),
                                              raw=self.raw,
                                              variant=self.variant)
        self.reward = 0

        # Actions
        self.A = list(Environment.actionMap.keys())

    def step(self, action: int):
        if action not in self.A:
            raise ValueError('Invalid action: {}'.format(action))

        state, self.reward, done, info = self.game.step(action)
        self.state = self.img2state.transfrom(state,
                                              raw=self.raw,
                                              variant=self.variant)

        if done and self.state[8]:
            self.reward = 100
        elif self.state[8]:
            self.reward = 30
        elif self.state[9]:
            self.reward = 15

        if self.verbose:
            self.game.render()

        return done

    def reset(self):
        self.state = self.img2state.transfrom(self.game.reset(),
                                              raw=self.raw,
                                              variant=self.variant)
        self.reward = 0
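
A minimal driver loop for the Environment class above might look like the sketch below; it assumes Img2State, gym_super_mario_bros, BinarySpaceToDiscreteSpaceEnv and SIMPLE_MOVEMENT are importable exactly as the class expects, and it simply samples random actions. Note that step() returns only the done flag; the most recent reward is kept in env.reward.

import random

if __name__ == '__main__':
    env = Environment(verbose=True)
    for episode in range(3):
        env.reset()
        done = False
        while not done:
            action = random.choice(env.A)   # uniform random action
            done = env.step(action)         # step() only returns the done flag
        print('episode', episode, 'final reward', env.reward)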
Example 2
def main():
    #env = gym_super_mario_bros.make('SuperMarioBros-v0')
    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v1')
    env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
    timestart = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d-%H:%M:%S')
    # env = VideoRecorderWrapper(env, PROJ_DIR + "/../video", str(timestart), 50)

    env = VideoRecorderWrapper(env, PROJ_DIR + "/../video/final", str(timestart), 1)
    env = DownsampleEnv(env, (84, 84))
    env = PenalizeDeathEnv(env, penalty=-25)
    env = FrameStackEnv(env, 4)
    # good
    #act = deepq.load(PROJ_DIR+"/../models/mario_model_2018-08-12-13:00:58.pkl")

    # better
    act = deepq.load(PROJ_DIR + "/../models/mario_model_2018-08-12-19:21:50.pkl")
    
    episode = 0
    while True:
        obs, done = env.reset(), False
        stepnr = 0
        episode_rew = 0
        while not done:
            env.render()
            obs, rew, done, _ = env.step(act(obs[None])[0])

            if stepnr % 20 == 0:
                plot_obs(obs)

            episode_rew += rew
            stepnr += 1
        print("Episode reward", episode_rew, episode)
        episode = episode+1
Example 3
	def run(self, solution, level, render, mode):
		env = gym_super_mario_bros.make(level)
		env = BinarySpaceToDiscreteSpaceEnv(env, COMPLEX_MOVEMENT)

		done = True
		reason_finish = "no_more_commands"

		pos = 0
		total_r = 0

		for step in range(len(solution)):
			if done:
				state = env.reset()

			state, reward, done, info = env.step(solution[pos])
			pos+=1


			if reward == -15:  # died
				reason_finish = "death"
				break

			if mode == "level" and info['flag_get'] == True:
				reason_finish = "win"
				break

			total_r = total_r + reward
			if render == "true":
				env.render()


		env.close()
		return total_r, pos, info, reason_finish
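
A possible caller for the run() method above, assuming it lives on some runner class (here called Runner, a hypothetical name): it replays a random command sequence over COMPLEX_MOVEMENT on World 1-1 and prints why the rollout ended.

import random
from gym_super_mario_bros.actions import COMPLEX_MOVEMENT

runner = Runner()  # hypothetical: whatever class defines run() above
solution = [random.randrange(len(COMPLEX_MOVEMENT)) for _ in range(2000)]
total_r, pos, info, reason = runner.run(solution, 'SuperMarioBros-1-1-v0',
                                        render="false", mode="level")
print(reason, total_r, pos)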
Example 4
def main(path="./models/deepq/mario_reward_1736.7.pkl"):
    step_mul = 16
    steps = 200

    FLAGS = flags.FLAGS
    flags.DEFINE_string("env", "SuperMarioBros-v0", "RL environment to train.")
    flags.DEFINE_string("algorithm", "deepq", "RL algorithm to use.")

    FLAGS(sys.argv)
    # 1. Create gym environment
    env = gym_super_mario_bros.make('SuperMarioBros-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)

    act = deepq.load(path)
    nstack = 4
    nh, nw, nc = env.observation_space.shape
    history = np.zeros((1, nh, nw, nc * nstack), dtype=np.uint8)

    obs, done = env.reset(), False
    # history = update_history(history, obs)
    episode_rew = 0
    while not done:
        env.render()
        action = act([obs])[0]
        obs, rew, done, _ = env.step(action)
        # history = update_history(history, obs)
        episode_rew += rew
        print("action : %s reward : %s" % (action, rew))

    print("Episode reward", episode_rew)
class MarioBrosEnvironment(AbstractEnvironment):
    def __init__(self, config):

        self.config = config
        if config.env == 'mario':
            from gym_super_mario_bros.actions import RIGHT_ONLY
            from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv
            import gym_super_mario_bros
            env = gym_super_mario_bros.make('SuperMarioBros-v0')
            self.env = BinarySpaceToDiscreteSpaceEnv(env, RIGHT_ONLY)
        elif config.env == 'montezuma':
            self.env = gym.make('MontezumaRevengeDeterministic-v0')
        self.state_buffer = []
        self.reward_buffer = []
        self.counter = 0

    def process_image(self, s, subsample=4):
        if self.config.env == 'mario':
            s = skimage.color.rgb2gray(s)
            s = skimage.transform.resize(
                s, (s.shape[0] // subsample, s.shape[1] // subsample),
                anti_aliasing=True,
                mode='constant')
            s = torch.from_numpy(s)
        elif self.config.env == 'montezuma':
            s = s[34:34 + 160, :160]
            s = skimage.color.rgb2gray(s)
            s = skimage.transform.resize(
                s, (s.shape[0] // subsample, s.shape[1] // subsample),
                anti_aliasing=True,
                mode='constant')
            s = torch.from_numpy(s).float()
        return s

    def reset(self):
        self.counter = 0
        self.state_buffer = []
        for _ in range(self.config.n_input_frames):
            state = self.process_image(self.env.reset(),
                                       self.config.image_subsample)
            self.state_buffer.append(state)

        return torch.stack(self.state_buffer)

    def step(self, action):
        total_reward = 0
        for _ in range(self.config.n_action_repeat):
            state, reward, done, info = self.env.step(action)
            total_reward += reward
            if done:
                break
        state = self.process_image(state, self.config.image_subsample)
        self.state_buffer.append(state)
        self.state_buffer = self.state_buffer[-self.config.n_input_frames:]
        return torch.stack(self.state_buffer), total_reward, done, info
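
A quick smoke test for MarioBrosEnvironment, assuming the class above is importable together with its skimage/torch dependencies; the config object only needs the attributes the class actually reads (env, n_input_frames, image_subsample, n_action_repeat).

from types import SimpleNamespace

config = SimpleNamespace(env='mario', n_input_frames=4,
                         image_subsample=4, n_action_repeat=4)
env = MarioBrosEnvironment(config)
state = env.reset()                       # stacked tensor of n_input_frames frames
obs, reward, done, info = env.step(1)     # action index into RIGHT_ONLY
print(state.shape, obs.shape, reward, done)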
Example 6
class Environment(threading.Thread):
    stop_signal = False

    def __init__(self,
                 render=False,
                 eps_start=EPS_START,
                 eps_end=EPS_STOP,
                 eps_steps=EPS_STEPS):
        threading.Thread.__init__(self)
        self.render = render

        # Make the super mario gym environment and apply wrappers
        self.env = gym.make(ENV)
        self.env = BinarySpaceToDiscreteSpaceEnv(self.env, SIMPLE_MOVEMENT)
        self.env = preprocess.GrayScaleImage(self.env,
                                             height=HIGHT,
                                             width=WIDTH,
                                             grayscale=True)
        # self.env = wrappers.Monitor(self.env, "./Super_Mario_AI/videos", force = True, write_upon_reset=True)
        self.agent = Agent(TEMPERATURE)

    def runEpisode(self):
        s = self.env.reset()
        R = 0
        while True:
            time.sleep(THREAD_DELAY)  # yield

            if self.render: self.env.render()

            a = self.agent.act(s)
            s_, r, done, info = self.env.step(a)

            if done:  # terminal state
                s_ = None

            self.agent.train(s, a, r, s_)

            s = s_
            R += r

            if done or self.stop_signal:
                break

        print("Total R:", R)

    def run(self):
        while not self.stop_signal:
            self.runEpisode()

    def stop(self):
        self.stop_signal = True
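
A sketch of how the threaded Environment above is typically launched, assuming the module-level constants (ENV, EPS_START, EPS_STOP, EPS_STEPS, THREAD_DELAY, TEMPERATURE, HIGHT, WIDTH) and the Agent and preprocess modules it references are defined elsewhere in the project.

import time

workers = [Environment(render=False) for _ in range(2)]
for w in workers:
    w.start()          # threading.Thread.start() invokes run()

time.sleep(60)         # let the workers play for a while

for w in workers:
    w.stop()           # raise the stop flag
for w in workers:
    w.join()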
Example 7
class MarioEnv:
    def __init__(self, os='mac', display=False):
        self.display = display
        if os == 'mac' or os == 'linux':
            env = gym_super_mario_bros.make('SuperMarioBros-v0')
            self.env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
        else:
            raise Exception("bad os")
        self.act_dim = self.env.action_space.n
        self.obs_dim = (1, 128, 128)
        print("env created with act_dim", self.act_dim, "obs_dim",
              self.obs_dim)
        self.transform = transforms.Compose([
            transforms.ToTensor(),  # chain 2 transforms together using list.
            transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
        ])

    def reset(self):
        state = self.env.reset()
        return self.__resize_image(state)

    def step(self, action):
        state, reward, done, info = self.env.step(action)
        if reward == 0:
            reward = -0.5
        state_t = self.__resize_image(state)
        return state_t, \
               np.reshape(reward, -1), \
               np.reshape(done, -1)

    def close(self):
        self.env.close()

    def __resize_image(self, state):
        state_new = cv2.resize(state, (128, 128))
        img = Image.fromarray(state_new)
        state_t = self.transform(img)[0, :, :].unsqueeze(0)
        state_t = state_t.float().to(DEVICE)
        return state_t.unsqueeze(0)

    def render(self):
        if self.display:
            self.env.render()
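
A minimal random rollout against the MarioEnv wrapper above, assuming DEVICE and the cv2/PIL/torchvision imports used by the class are available; note that step() returns only (state, reward, done), without the info dict.

import random

env = MarioEnv(os='mac', display=False)
state = env.reset()                          # tensor of shape (1, 1, 128, 128)
for _ in range(200):
    action = random.randrange(env.act_dim)
    state, reward, done = env.step(action)
    env.render()
    if done[0]:
        state = env.reset()
env.close()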
Example 8
def main():
    env = gym_super_mario_bros.make('SuperMarioBros-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)

    done = True
    max_step = 5000
    print(env.observation_space.shape)
    # On Windows, ascii=True is needed so the progress bar does not wrap lines
    qbar = tqdm(total=max_step, ascii=True)
    for step in range(max_step):
        qbar.update()
        if done:
            state = env.reset()
        action = get_action(state, env.action_space)
        state, reward, done, info = env.step(action)
        if done:
            print(str(step) + " Hero, come back and try again! " + str(info))
        env.render()
    env.close()
    qbar.close()
Example 9
    env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)

    myRL = DQNAgent(gamma=GAMMA, initial_epsilon=INITIAL_EPSILON,
                    final_epsilon=FINAL_EPSILON,
                    decay_epsilon=DECAY_EPSILON,
                    lr=LEARNING_RATE)

    for episode in range(EPISODES):
        env = gym_super_mario_bros.make('SuperMarioBros-v0')
        env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
        done = True
        action = 0  # action 'NOOP'

        for itera in range(ITERATION):
            if done:
                env.reset()
                oldObse, _, _, _ = env.step(0)  # get the initial state
                oldObse = myRL.pre_process(oldObse)
                # oldObse = (-1, 'NOOP', 0)

            action = myRL.chooseAction(oldObse)
            
            newObse, reward, done, info = env.step(action)
            newObse = myRL.pre_process(newObse)

            myRL.remember(state=oldObse, action=action, reward=reward, next_state=newObse, done=done)
            oldObse = newObse

            if len(myRL.memory) > BATCH:
                myRL.learn_from_replay(BATCH)
            env.render()
Example 10
class Game:
    def __init__(self, game_id, obs_size, skip_frame=4, mode='train'):
        self.game_id = game_id
        env = gym_super_mario_bros.make(game_id)
        temp_obs = env.reset()
        height, width, _ = temp_obs.shape
        self.env = BinarySpaceToDiscreteSpaceEnv(env, COMPLEX_MOVEMENT)

        self.obs_last2max = np.zeros((2, obs_size, obs_size, 1), np.uint8)

        self.obstack = np.zeros((obs_size, obs_size, 4))
        self.rewards = []
        self.lives = 3
        self.skip = skip_frame
        self.mode = mode
        if self.mode == 'play':
            self.monitor = Monitor(width=width, height=height)

    def step(self, action, monitor=False):
        reward = 0.0
        done = False

        for i in range(self.skip):
            obs, r, done, info = self.env.step(action)

            if self.mode == 'play':
                self.monitor.record(obs)

            if i >= 2:
                self.obs_last2max[i % 2] = self._process_obs(obs)

            # Super Mario's reward is clipped to [-15.0, 15.0]
            reward += r / 15.0
            lives = info['life']

            if lives < self.lives:
                done = True

            self.lives = lives

            if done:
                break

        self.rewards.append(reward)

        if done:
            episode_info = {
                "reward": sum(self.rewards),
                "length": len(self.rewards)
            }
            self.reset()
        else:
            episode_info = None

            obs = self.obs_last2max.max(axis=0)

            self.obstack = np.roll(self.obstack, shift=-1, axis=-1)
            self.obstack[..., -1:] = obs

        return self.obstack, reward, done, episode_info

    def reset(self):
        obs = self.env.reset()

        obs = self._process_obs(obs)
        # fill all four stacked frames with the initial observation
        self.obstack[..., 0:1] = obs
        self.obstack[..., 1:2] = obs
        self.obstack[..., 2:3] = obs
        self.obstack[..., 3:4] = obs
        self.rewards = []

        self.lives = 3

        return self.obstack

    @staticmethod
    def _process_obs(obs):

        obs = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY)
        obs = cv2.resize(obs, (84, 84), interpolation=cv2.INTER_AREA)
        return obs[:, :, None]
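
A random-action rollout with the Game wrapper above, assuming cv2 and gym_super_mario_bros are installed; obs_size is set to 84 to match the preprocessing in _process_obs, and step() returns an episode summary dict only when an episode finishes.

import random
from gym_super_mario_bros.actions import COMPLEX_MOVEMENT

game = Game('SuperMarioBros-v0', obs_size=84, skip_frame=4, mode='train')
obs = game.reset()                           # stacked frames, shape (84, 84, 4)
for _ in range(1000):
    action = random.randrange(len(COMPLEX_MOVEMENT))
    obs, reward, done, episode_info = game.step(action)
    if episode_info is not None:             # an episode just ended
        print(episode_info)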
Example 11
def replay_genome(genome, movements, gen):
    env_expanded = gym_super_mario_bros.SuperMarioBrosEnv(frames_per_step=1,
                                                          rom_mode='vanilla')
    env = BinarySpaceToDiscreteSpaceEnv(env_expanded, movements)

    print('Number of genes: ', len(genome.connection_genes))
    for gene in genome.connection_genes:
        print(gene.in_node, gene.out_node, gene.weight, gene.innovation_number,
              gene.type, gene.enabled)

    done = True
    unticked = 0
    tick_interval = 1 / 30
    last_tick_time = time.time()

    fps = 0
    frames = 0
    last_fps_time = time.time()

    for _ in range(500000):

        unticked += time.time() - last_tick_time
        last_tick_time = time.time()
        ticked = False

        # while unticked >= tick_interval:
        if done:
            state = env.reset()

        state_downscaled = get_sensor_map(env_expanded)

        action = genome.calculate_action(state_downscaled)

        # print('\rFPS: {:.3f}'.format(fps), end=' ')
        # print(vectofixedstr(action, 10), end=' ')
        action = np.argmax(action)
        print('\rtaking action', movements[action], end='', flush=True)

        state, reward, done, info = env.step(action)

        #filename = get_path_of('all_pictures/mario/')
        #imsave(filename + 'mario_' + str(_) + '.png', state)

        save_state = np.full((13, 10, 3), 255, dtype=int)

        COLORS = [[250, 250, 250], [0, 0, 0], [196, 0, 0], [0, 0, 196]]

        for i in range(13):
            for j in range(10):
                if state_downscaled[(i, j)] == -1:
                    save_state[(i, j)] = COLORS[3]
                elif state_downscaled[(i, j)] == 0:
                    save_state[(i, j)] = COLORS[0]
                else:
                    save_state[(i, j)] = COLORS[1]

        save_state[(7, 2)] = COLORS[2]

        # filename = get_path_of('all_pictures/input_downscaled/')
        # imsave(filename + 'state_' + str(_) + '.png', save_state.astype(np.uint8))

        # make_controller(movements[action], _, gen)

        env.render()

        if info["life"] <= 2:
            died = True
            break

        ticked = True
        frames += 1
        unticked -= tick_interval

        # if ticked:
        #     now = time.time()
        #     if now - last_fps_time >= 1:
        #         fps = frames / (now - last_fps_time)
        #         last_fps_time = now
        #         frames = 0
        # else:
        #     time.sleep(0.001)

    env.close()
Example 12
    env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
    print(SIMPLE_MOVEMENT, env.action_space.n)
    # get size of state and action from environment
    state_size = env.observation_space.shape
    action_size = env.action_space.n
    memorydb_instance = MemoryDB(e, a, beta, beta_increment_per_sampling,
                                 capacity, max_priority)
    agent_instance = DQNAgent(input_size, action_size, esplison,
                              esplison_decay, True)
    state_generator_instance = StateGenerator(frame_size, stack_size)
    scores, episodes = [], []

    for e in range(max_episodes):
        done = False
        score = 0
        raw_state = env.reset()
        state = state_generator_instance.get_stacked_frames(raw_state, True)

        steps = 0  # up to 500

        while not done and steps < max_steps:
            if render:  # if True
                env.render()
            steps += 1
            # get e greedy action
            action = agent_instance.get_action(np.array([state]))
            raw_state, reward, done, info = env.step(action)
            frame = 1
            while frame < stack_size and not done:
                raw_state, reward, done, info = env.step(action)
                frame += 1
Example 13
class MarioEnvironment(Process):
    def __init__(
            self,
            env_id,
            is_render,
            env_idx,
            child_conn,
            history_size=4,
            life_done=False,
            h=84,
            w=84, movement=COMPLEX_MOVEMENT, sticky_action=True,
            p=0.25):
        super(MarioEnvironment, self).__init__()
        self.daemon = True
        self.env = BinarySpaceToDiscreteSpaceEnv(
            gym_super_mario_bros.make(env_id), movement)

        self.is_render = is_render
        self.env_idx = env_idx
        self.steps = 0
        self.episode = 0
        self.rall = 0
        self.recent_rlist = deque(maxlen=100)
        self.child_conn = child_conn

        self.life_done = life_done
        self.sticky_action = sticky_action
        self.last_action = 0
        self.p = p

        self.history_size = history_size
        self.history = np.zeros([history_size, h, w])
        self.h = h
        self.w = w

        self.reset()

    def run(self):
        super(MarioEnvironment, self).run()
        while True:
            action = self.child_conn.recv()
            if self.is_render:
                self.env.render()

            # sticky action
            if self.sticky_action:
                if np.random.rand() <= self.p:
                    action = self.last_action
                self.last_action = action

            # 4 frame skip
            reward = 0.0
            done = None
            for i in range(4):
                obs, r, done, info = self.env.step(action)
                if self.is_render:
                    self.env.render()
                reward += r
                if done:
                    break

            # when Mario loses life, changes the state to the terminal
            # state.
            if self.life_done:
                if self.lives > info['life'] and info['life'] > 0:
                    force_done = True
                    self.lives = info['life']
                else:
                    force_done = done
                    self.lives = info['life']
            else:
                force_done = done

            # reward range -15 ~ 15
            log_reward = reward / 15
            self.rall += log_reward

            r = int(info.get('flag_get', False))

            self.history[:3, :, :] = self.history[1:, :, :]
            self.history[3, :, :] = self.pre_proc(obs)

            self.steps += 1

            if done:
                self.recent_rlist.append(self.rall)
                print(
                    "[Episode {}({})] Step: {}  Reward: {}  Recent Reward: {}  Stage: {} current x:{}   max x:{}".format(
                        self.episode,
                        self.env_idx,
                        self.steps,
                        self.rall,
                        np.mean(
                            self.recent_rlist),
                        info['stage'],
                        info['x_pos'],
                        self.max_pos))

                self.history = self.reset()

            self.child_conn.send([self.history[:, :, :], r, force_done, done, log_reward])

    def reset(self):
        self.last_action = 0
        self.steps = 0
        self.episode += 1
        self.rall = 0
        self.lives = 3
        self.stage = 1
        self.max_pos = 0
        self.get_init_state(self.env.reset())
        return self.history[:, :, :]

    def pre_proc(self, X):
        # grayscaling
        x = cv2.cvtColor(X, cv2.COLOR_RGB2GRAY)
        # resize
        x = cv2.resize(x, (self.h, self.w))

        return x

    def get_init_state(self, s):
        for i in range(self.history_size):
            self.history[i, :, :] = self.pre_proc(s)
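
The parent side of the Pipe protocol used by this MarioEnvironment worker might look like the sketch below (an assumption based on the send/recv calls above): the parent sends one action index per step and receives [history, r, force_done, done, log_reward] back.

import random
from multiprocessing import Pipe

from gym_super_mario_bros.actions import COMPLEX_MOVEMENT

if __name__ == '__main__':
    parent_conn, child_conn = Pipe()
    worker = MarioEnvironment('SuperMarioBros-v0', is_render=False,
                              env_idx=0, child_conn=child_conn)
    worker.start()
    for _ in range(100):
        parent_conn.send(random.randrange(len(COMPLEX_MOVEMENT)))
        history, r, force_done, done, log_reward = parent_conn.recv()
    worker.terminate()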
Example 14
def main():
    movement = list(SIMPLE_MOVEMENT)  # copy so the shared SIMPLE_MOVEMENT list is not mutated
    movement.append(['left', 'A'])
    movement.append(['left', 'B'])
    movement.append(['left', 'A', 'B'])
    #movement.append(['B'])
    #movement.append(['down'])
    #movement.append(['up'])

    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, movement)

    #channels is acting as the number of frames in history
    #if resize_height and height are different, assert final_height < resize_height and image will be cropped
    channels = 3
    frames = 4
    width = 128
    resize_height = 180
    final_height = 128
    bottom_chop = 15

    epsilon = 0.0

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(1)
    device = torch.device("cuda" if use_cuda else "cpu")

    model = simple_net(channels, len(movement), device).to(device)

    model_file = 'mario_agent'
    model.load_state_dict(torch.load(model_file))

    max_steps = 5000
    num_eps = 1

    for episode in range(num_eps):
        print('Episode {}'.format(episode + 1))
        state = env.reset()
        state = preprocess(state, [resize_height, width, 3], final_height,
                           bottom_chop)
        state = torch.cat((state, state, state, state))
        action = 0

        episode_reward = 0

        for step in range(max_steps):
            if step % 3 == 0:
                if random.random() < epsilon:
                    action = random.randint(0, len(movement) - 1)
                else:
                    q_val, action, q_vals = maxQ(state, model, device)

            next_state, reward, done, info = env.step(int(action))

            if reward > 0:
                reward = 1
            else:
                reward = -1

            episode_reward += reward

            next_state = preprocess(next_state, [resize_height, width, 3],
                                    final_height, bottom_chop)
            next_state = torch.cat((state[3:, :, :], next_state))

            state = next_state

            env.render()
            time.sleep(0.03)

            if done:
                break

    env.close()
Example 15
        save_dir = save_dir.resolve()
    except FileNotFoundError:
        save_dir.mkdir(parents=True)
        save_dir = save_dir.resolve()
        print("Made save path at: {}".format(save_dir))
    save_path = save_dir / AGENT_FILENAME

    if Path.is_file(save_path):
        print("Loading saved agent...")
        agent.load(save_path)

    done = False
    batch_size = 32

    for e in range(1, EPISODES + 1):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        time = 0
        while True:
            env.render()
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            reward = reward if not done else -10
            next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done or time >= 500:
                print("episode: {}/{}, score: {}, e: {:.2}".format(
                    e, EPISODES, time, agent.epsilon))
                break
            time += 1
Example 16
class MarioEnvironment(AbstractEnvironment):
    """
    Standard Super Mario Bros Environment
    https://github.com/Kautenja/gym-super-mario-bros
    """
    def __init__(self,
                 game_name,
                 task_name,
                 action_mode=SIMPLE_MOVEMENT,
                 state_size=None):
        """
        Args:
        game_name : string
        game_name = name of the game (e.g. SuperMarioBros-5-1-v0)
        task_name : string
        task_name = name of the task
        state_size : list or tuple or None
        state_size = size of state, [h, w] or [h, w, c]
        """
        self.game_name = game_name
        self.task_name = task_name
        self.action_mode = action_mode
        self.state_size = state_size
        self.env = gym_super_mario_bros.make(game_name)
        self.env = BinarySpaceToDiscreteSpaceEnv(self.env, self.action_mode)
        self.n_action = self.env.action_space.n
        self.actions = [a for a in range(self.n_action)]
        self.new_episode()

    def get_state(self, setting=None):
        """
        Get Current State
        Args:
        setting : dictionary
        setting = setting for states
            'resolution' : list or tuple or None
            'resolution' = resolution of states, [h, w, c] or [h, w]
        Returns:
        state : numpy.ndarray
        state = current screen, shape [h, w, c], values locate at [0, 1]
        """
        if (setting is None or ('resolution' not in setting.keys())):
            resolution = self.state_size
        else:
            resolution = setting['resolution']
        normalized = False
        if (len(resolution) == 3 and resolution[2] == 1):
            state = rgb2grey(self.ob)
            normalized = True
        else:
            state = self.ob
        if (state.ndim == 2):
            state = np.expand_dims(state, axis=-1)
        assert (state.ndim == 3), 'shape of screen should be [h, w, c]'
        state = resize(state, resolution[:2], preserve_range=True)
        state = state.astype(float)
        if (not normalized):
            state /= 255.
        return state

    def apply_action(self, action, num_repeat):
        """
        Apply Actions To The Environment And Get Reward
        Args:
        action : int
        action = applied action
        num_repeat : int
        num_repeat = number of repeated actions
        Returns:
        reward : float
        reward = reward of last action
        """
        assert (not self.done), 'The episode is done'
        reward = 0
        for _ in range(num_repeat):
            self.ob, reward, self.done, _ = self.env.step(action)
            self.score += reward
            if (self.done):
                break
        reward = reward_reshape(reward, self.game_name, self.task_name)
        return reward

    def new_episode(self):
        """
        Start A New Episode
        """
        self.ob = self.env.reset()
        self.done = False
        self.score = 0

    def episode_end(self):
        """
        Check If The Episode Ends
        Returns:
        ep_end : bool
        ep_end = when the episode finishes, return True
        """
        return self.done

    def action_set(self):
        """
        Get Actions Set
        Returns:
        actions : list
        actions = list of actions
        """
        return self.actions

    def available_action(self):
        """
        Get Indices of Available Actions For Current State
        Returns:
        available_ind : list
        available_ind = indices of available action
        """
        return self.actions

    def episode_total_score(self):
        """
        Get Total Score For Last Episode
        """
        return self.score

    def close(self):
        """
        Close The Environment
        """
        self.env.close()
        return True
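
A short random-policy evaluation loop over the MarioEnvironment above, assuming the class, reward_reshape and its skimage imports are available; state_size is passed explicitly because get_state() falls back to it when no resolution is given.

import random

env = MarioEnvironment('SuperMarioBros-1-1-v0', task_name='level',
                       state_size=[84, 84, 1])
while not env.episode_end():
    action = random.choice(env.action_set())
    env.apply_action(action, num_repeat=4)
    state = env.get_state()                  # grayscale 84x84x1 in [0, 1]
print('total score:', env.episode_total_score())
env.close()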
Example 17
class MoMarioEnv(Process):
    def __init__(self, args, env_idx, child_conn, history_size=4, h=84, w=84):
        super(MoMarioEnv, self).__init__()
        self.daemon = True
        self.env = BinarySpaceToDiscreteSpaceEnv(
            gym_super_mario_bros.make(args.env_id), SIMPLE_MOVEMENT)

        self.is_render = args.render
        self.env_idx = env_idx
        self.steps = 0
        self.episode = 0
        self.rall = 0
        self.coin = 0
        self.x_pos = 0
        self.time = 0
        self.score = 0
        self.n_mo = 5
        self.morall = np.zeros(self.n_mo)
        self.recent_rlist = deque(maxlen=100)
        self.recent_morlist = deque(maxlen=100)
        self.child_conn = child_conn
        self.life_done = args.life_done
        self.single_stage = args.single_stage
        self.stage_bonus = 0

        self.history_size = history_size
        self.history = np.zeros([history_size, h, w])
        self.h = h
        self.w = w

        self.reset()

    def run(self):
        super(MoMarioEnv, self).run()
        while True:
            action = self.child_conn.recv()
            if self.is_render:
                self.env.render()
            obs, reward, done, info = self.env.step(action)

            if self.single_stage and info["flag_get"]:
                self.stage_bonus = 10000
                done = True
            ''' Construct Multi-Objective Reward'''  #####################################
            # [x_pos, time, death, coin]
            moreward = []
            # 1. x position
            xpos_r = info["x_pos"] - self.x_pos
            self.x_pos = info["x_pos"]
            # resolve an issue where after death the x position resets
            if xpos_r < -5:
                xpos_r = 0
            moreward.append(xpos_r)

            # 2. time penalty
            time_r = info["time"] - self.time
            self.time = info["time"]
            # time is always decreasing
            if time_r > 0:
                time_r = 0
            moreward.append(time_r)

            # 3. death
            if self.lives > info['life']:
                death_r = -25
            else:
                death_r = 0
            moreward.append(death_r)

            # 4. coin
            coin_r = (info['coins'] - self.coin) * 100
            self.coin = info['coins']
            moreward.append(coin_r)

            # 5. enemy
            enemy_r = info['score'] - self.score
            if coin_r > 0 or done:
                enemy_r = 0
            self.score = info['score']
            moreward.append(enemy_r)

            ############################################################################

            if self.life_done:
                # when Mario loses life, changes the state to the terminal
                # state.
                if self.lives > info['life'] and info['life'] > 0:
                    force_done = True
                    self.lives = info['life']
                else:
                    force_done = done
                    self.lives = info['life']
            else:
                # normal terminal state
                force_done = done

            # reward range -15 ~ 15
            r = reward / 15
            self.rall += reward

            self.morall += np.array(moreward)
            mor = np.array(moreward) * self.n_mo / 15

            self.history[:3, :, :] = self.history[1:, :, :]
            self.history[3, :, :] = self.pre_proc(obs)

            self.steps += 1

            score = info['score'] + self.stage_bonus

            if done:
                self.recent_rlist.append(self.rall)
                self.recent_morlist.append(self.morall)
                print(
                    "[Episode {}({})]\tStep: {}\tScore: {}\tMoReward: {}\tRecent MoReward: {}\tcoin: {}\tcurrent x:{}"
                    .format(self.episode, self.env_idx, self.steps,
                            score, self.morall,
                            np.mean(self.recent_morlist,
                                    axis=0), info['coins'], info['x_pos']))

                self.history = self.reset()

            self.child_conn.send(
                [self.history[:, :, :], r, force_done, done, mor, score])

    def reset(self):
        self.steps = 0
        self.episode += 1
        self.rall = 0
        self.lives = 3
        self.coin = 0
        self.x_pos = 0
        self.time = 0
        self.score = 0
        self.stage_bonus = 0
        self.morall = np.zeros(self.n_mo)
        self.get_init_state(self.env.reset())
        return self.history[:, :, :]

    def pre_proc(self, X):
        # grayscaling
        x = cv2.cvtColor(X, cv2.COLOR_RGB2GRAY)
        # resize
        x = cv2.resize(x, (self.h, self.w))
        x = np.float32(x) * (1.0 / 255.0)

        return x

    def get_init_state(self, s):
        for i in range(self.history_size):
            self.history[i, :, :] = self.pre_proc(s)
Example 18
        alldiscrewards.append(discountrewards(rewards))
    fullrewards = numpy.concatenate(alldiscrewards)
    rmean = fullrewards.mean()
    rstd = fullrewards.std()
    return [(discrewards - rmean) / rstd for discrewards in alldiscrewards]


# In[2]:

## Base model to run the game, using random movements
from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
env.reset()

# In[3]:

#from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv
#import gym_super_mario_bros
#from gym_super_mario_bros.actions import COMPLEX_MOVEMENT
#env = gym_super_mario_bros.make('SuperMarioBros-v0')
#env = BinarySpaceToDiscreteSpaceEnv(env, COMPLEX_MOVEMENT)
#

# In[4]:

import numpy as np
import skimage
Example 19
def main():
    movement = list(SIMPLE_MOVEMENT)  # copy so the shared SIMPLE_MOVEMENT list is not mutated
    movement.append(['left', 'A'])
    movement.append(['left', 'B'])
    movement.append(['left', 'A', 'B'])
    movement.append(['B'])
    movement.append(['down'])
    movement.append(['up'])

    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, movement)

    #channels is acting as the number of frames in history
    #if resize_height and height are different, assert final_height < resize_height and image will be cropped
    channels = 4
    width = 84
    resize_height = 110
    final_height = 84
    size = [channels, final_height, width]

    batch_size = 32
    replay_capacity = 100000
    replay_dir = '/home/hansencb/mario_replay/'
    epsilon = 1
    gamma = 0.9

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(1)
    device = torch.device("cuda" if use_cuda else "cpu")

    model = simple_net(channels, len(movement), device).to(device)
    target_model = simple_net(channels, len(movement), device).to(device)

    model_file = 'mario_agent'
    model.load_state_dict(torch.load(model_file))
    target_model.load_state_dict(torch.load(model_file))

    lr = 0.001
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    total_reward_file = 'total_reward.txt'
    with open(total_reward_file, 'w') as f:
        f.write('Reward\tSteps\n')

    max_steps = 5000
    num_eps = 1000

    data = dataset(replay_capacity, batch_size, replay_dir, 1, size)

    for episode in range(num_eps):
        print('Episode {}'.format(episode + 1))
        state = env.reset()
        state = preprocess(state, [resize_height, width], final_height)
        state = torch.cat((state, state, state, state))
        action = 0

        episode_reward = 0

        for step in range(max_steps):
            if step % 3 == 0:
                if random.random() < epsilon:
                    action = random.randint(0, len(movement) - 1)
                else:
                    q_val, action = maxQ(state, model, device)

            next_state, reward, done, info = env.step(int(action))

            if reward > 0:
                reward = 1
            else:
                reward = -1

            episode_reward += reward

            next_state = preprocess(next_state, [resize_height, width],
                                    final_height)
            next_state = torch.cat((state[1:, :, :], next_state))

            trans = transition(state, action, reward, next_state, done)
            data.add(trans)
            train(model, device, optimizer,
                  data.get_batch(model, device, gamma))

            state = next_state

            env.render()
            #time.sleep(0.03)

            if done:
                with open(total_reward_file, 'a') as f:
                    f.write('{}\t{}\n'.format(episode_reward, step))

                break

        epsilon -= (1 / num_eps)
        if episode % 10 == 0:
            target_model.load_state_dict(model.state_dict())

            with open(model_file, 'wb') as f:
                torch.save(model.state_dict(), f)

    env.close()
Example 20
def train(num_episodes, episode_length, learning_rate, scenario="deatmatch.cfg", map_path='map02', render=True):
        # discount factor
        discount_factor = 0.99
        # interval for updating experiences in the buffer
        learning_rate = 0.01
        update_frequency = 5
        store_frequency = 50

        # interval for printing output
        print_frequency = 1000

        # initialize variables to store the total reward and total loss
        total_reward = 0
        total_loss = 0
        old_q_value = 0

        # initialize lists to store episodic rewards and losses
        rewards = []
        losses = []


        env = gym_super_mario_bros.make('SuperMarioBros-v0')
        env = BinarySpaceToDiscreteSpaceEnv(env, COMPLEX_MOVEMENT)

        env.reset()
        actionDRQN = DRQN((240, 256, 3), 11,learning_rate)
        targetDRQN = DRQN((240, 256, 3), 11,learning_rate)


        # experience replay buffer capacity
        experiences = ExperienceReplay(1000000)
        # model saving
        saver = tf.train.Saver({v.name: v for v in actionDRQN.parameters}, max_to_keep=1)

        # start training: transitions are stored into the buffer every `store`
        # frames and sampled from it every `sample` frames
        sample = 10
        store = 100

        with tf.Session() as sess:

            # initialize all TensorFlow variables
            sess.run(tf.global_variables_initializer())
            for episode in range(num_episodes):
                # start a new episode
                state = env.reset()

                for frame in range(episode_length):
                    env.render()
                    # predict an action from the current frame
                    action = actionDRQN.prediction.eval(feed_dict={actionDRQN.input: state})
                    # step the environment with the chosen action
                    next_state, reward, done, info = env.step(action)
                    # update the reward
                    total_reward += reward

                    state = next_state
                    # break when the game is over
                    if done:
                        break
                    # put the transition into the buffer
                    if (frame % store) == 0:
                        experiences.appendToBuffer((state, action, reward))

                    # sample from the buffer
                    if (frame % sample) == 0:
                        memory = experiences.sample(1)
                        mem_frame = memory[0][0]
                        mem_reward = memory[0][2]

                        # train
                        Q1 = actionDRQN.output.eval(feed_dict={actionDRQN.input: state})
                        Q2 = targetDRQN.output.eval(feed_dict={targetDRQN.input: mem_frame})

                        # learning rate
                        learning_rate = actionDRQN.learning_rate.eval()

                        # compute the target Q value
                        Qtarget = old_q_value + learning_rate * (mem_reward + discount_factor * Q2 - old_q_value)

                        #update

                        old_q_value = Qtarget

                        # compute the loss
                        loss = actionDRQN.loss.eval(feed_dict={actionDRQN.target_vector: Qtarget, actionDRQN.input: mem_frame})

                        # update loss
                        total_loss += loss

                        # update both networks
                        actionDRQN.update.run(feed_dict={actionDRQN.target_vector: Qtarget, actionDRQN.input: mem_frame})
                        targetDRQN.update.run(feed_dict={targetDRQN.target_vector: Qtarget, targetDRQN.input: mem_frame})
                        rewards.append((episode, total_reward))
                        losses.append((episode, total_loss))

                        total_reward = 0
                        total_loss = 0
Example 21
class Agent:
    def __init__(self, level_name):
        self.level_name = level_name
        # setup environment
        self.env = gym_super_mario_bros.make(level_name)
        self.env = BinarySpaceToDiscreteSpaceEnv(self.env, SIMPLE_MOVEMENT)
        # one hot encoded version of our actions
        self.possible_actions = np.array(
            np.identity(self.env.action_space.n, dtype=int).tolist())

        # reset graph
        tf.reset_default_graph()

        # instantiate the DQNetwork
        self.DQNetwork = DQNetwork(state_size, action_size, learning_rate)

        # instantiate memory
        self.memory = Memory(max_size=memory_size)

        # initialize deque with zero images
        self.stacked_frames = deque(
            [np.zeros((100, 128), dtype=int) for i in range(stack_size)],
            maxlen=4)

        for i in range(pretrain_length):
            # If it's the first step
            if i == 0:
                state = self.env.reset()
                state, self.stacked_frames = stack_frames(
                    self.stacked_frames, state, True)

            # Get next state, the rewards, done by taking a random action
            choice = random.randint(1, len(self.possible_actions)) - 1
            action = self.possible_actions[choice]
            next_state, reward, done, _ = self.env.step(choice)

            # stack the frames
            next_state, self.stacked_frames = stack_frames(
                self.stacked_frames, next_state, False)

            # if the episode is finished (we're dead)
            if done:
                # we finished the episode
                next_state = np.zeros(state.shape)

                # add experience to memory
                self.memory.add((state, action, reward, next_state, done))

                # start a new episode
                state = self.env.reset()
                state, self.stacked_frames = stack_frames(
                    self.stacked_frames, state, True)
            else:
                # add experience to memory
                self.memory.add((state, action, reward, next_state, done))

                # our new state is now the next_state
                state = next_state

        # saver will help us save our model
        self.saver = tf.train.Saver()

        # setup tensorboard writer
        self.writer = tf.summary.FileWriter("logs/")

        # losses
        tf.summary.scalar("Loss", self.DQNetwork.loss)

        self.write_op = tf.summary.merge_all()

    def predict_action(self, sess, explore_start, explore_stop, decay_rate,
                       decay_step, state, actions):
        # first we randomize a number
        exp_exp_tradeoff = np.random.rand()

        explore_probability = explore_stop + (
            explore_start - explore_stop) * np.exp(-decay_rate * decay_step)

        if explore_probability > exp_exp_tradeoff:
            # make a random action
            choice = random.randint(1, len(self.possible_actions)) - 1
            action = self.possible_actions[choice]
        else:
            # estimate the Qs values state
            Qs = sess.run(self.DQNetwork.output,
                          feed_dict={
                              self.DQNetwork.inputs_:
                              state.reshape((1, *state.shape))
                          })

            # take the biggest Q value (= best action)
            choice = np.argmax(Qs)
            action = self.possible_actions[choice]

        return action, choice, explore_probability

    def play_notebook(self):
        import matplotlib.pyplot as plt
        # imports to render env to gif
        from JSAnimation.IPython_display import display_animation
        from matplotlib import animation
        from IPython.display import display

        # http://mckinziebrandon.me/TensorflowNotebooks/2016/12/21/openai.html
        def display_frames_as_gif(frames):
            """
            Displays a list of frames as a gif, with controls
            """
            #plt.figure(figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0), dpi = 72)
            patch = plt.imshow(frames[0])
            plt.axis('off')

            def animate(i):
                patch.set_data(frames[i])

            anim = animation.FuncAnimation(plt.gcf(),
                                           animate,
                                           frames=len(frames),
                                           interval=50)
            display(display_animation(anim, default_mode='loop'))

        frames = []
        with tf.Session() as sess:
            total_test_rewards = []

            # Load the model
            self.saver.restore(sess, "models/{0}.cpkt".format(self.level_name))

            for episode in range(1):
                total_rewards = 0

                state = self.env.reset()
                state, self.stacked_frames = stack_frames(
                    self.stacked_frames, state, True)

                print("****************************************************")
                print("EPISODE ", episode)

                while True:
                    # Reshape the state
                    state = state.reshape((1, *state_size))
                    # Get action from Q-network
                    # Estimate the Qs values state
                    Qs = sess.run(self.DQNetwork.output,
                                  feed_dict={self.DQNetwork.inputs_: state})

                    # Take the biggest Q value (= the best action)
                    choice = np.argmax(Qs)

                    #Perform the action and get the next_state, reward, and done information
                    next_state, reward, done, _ = self.env.step(choice)
                    frames.append(self.env.render(mode='rgb_array'))

                    total_rewards += reward

                    if done:
                        print("Score", total_rewards)
                        total_test_rewards.append(total_rewards)
                        break

                    next_state, self.stacked_frames = stack_frames(
                        self.stacked_frames, next_state, False)
                    state = next_state

            self.env.close()

        display_frames_as_gif(frames)

    def play(self):
        with tf.Session() as sess:
            total_test_rewards = []

            # Load the model
            self.saver.restore(sess, "models/{0}.cpkt".format(self.level_name))

            for episode in range(1):
                total_rewards = 0

                state = self.env.reset()
                state, self.stacked_frames = stack_frames(
                    self.stacked_frames, state, True)

                print("****************************************************")
                print("EPISODE ", episode)

                while True:
                    # Reshape the state
                    state = state.reshape((1, *state_size))
                    # Get action from Q-network
                    # Estimate the Qs values state
                    Qs = sess.run(self.DQNetwork.output,
                                  feed_dict={self.DQNetwork.inputs_: state})

                    # Take the biggest Q value (= the best action)
                    choice = np.argmax(Qs)

                    #Perform the action and get the next_state, reward, and done information
                    next_state, reward, done, _ = self.env.step(choice)
                    self.env.render()

                    total_rewards += reward

                    if done:
                        print("Score", total_rewards)
                        total_test_rewards.append(total_rewards)
                        break

                    next_state, self.stacked_frames = stack_frames(
                        self.stacked_frames, next_state, False)
                    state = next_state
            self.env.close()

    def train(self):
        with tf.Session() as sess:
            # initialize the variables
            sess.run(tf.global_variables_initializer())

            # initialize decay rate (that will be used to reduce epsilon)
            decay_step = 0

            for episode in range(total_episodes):
                # set step to 0
                step = 0

                # initialize rewards of episode
                episode_rewards = []

                # make a new episode and observe the first state
                state = self.env.reset()

                # remember that stack frame function
                state, self.stacked_frames = stack_frames(
                    self.stacked_frames, state, True)

                print("Episode:", episode)

                while step < max_steps:
                    step += 1
                    #print("step:", step)

                    # increase decay_step
                    decay_step += 1

                    # predict an action
                    action, choice, explore_probability = self.predict_action(
                        sess, explore_start, explore_stop, decay_rate,
                        decay_step, state, self.possible_actions)

                    # perform the action and get the next_state, reward, and done information
                    next_state, reward, done, _ = self.env.step(choice)

                    if episode_render:
                        self.env.render()

                    # add the reward to total reward
                    episode_rewards.append(reward)

                    # the game is finished
                    if done:
                        print("done")
                        # the episode ends so no next state
                        next_state = np.zeros((110, 84), dtype=int)

                        next_state, self.stacked_frames = stack_frames(
                            self.stacked_frames, next_state, False)

                        # set step = max_steps to end episode
                        step = max_steps

                        # get total reward of the episode
                        total_reward = np.sum(episode_rewards)

                        print("Episode:", episode, "Total reward:",
                              total_reward, "Explore P:", explore_probability,
                              "Training Loss:", loss)

                        #rewards_list.append((episode, total_reward))

                        # store transition <s_i, a, r_{i+1}, s_{i+1}> in memory
                        self.memory.add(
                            (state, action, reward, next_state, done))
                    else:
                        # stack frame of the next state
                        next_state, self.stacked_frames = stack_frames(
                            self.stacked_frames, next_state, False)

                        # store transition <s_i, a, r_{i+1}, s_{i+1}> in memory
                        self.memory.add(
                            (state, action, reward, next_state, done))

                        # s_{i} := s_{i+1}
                        state = next_state

                    ### Learning part
                    # obtain random mini-batch from memory
                    batch = self.memory.sample(batch_size)
                    states_mb = np.array([each[0] for each in batch], ndmin=3)
                    actions_mb = np.array([each[1] for each in batch])
                    rewards_mb = np.array([each[2] for each in batch])
                    next_states_mb = np.array([each[3] for each in batch],
                                              ndmin=3)
                    dones_mb = np.array([each[4] for each in batch])

                    target_Qs_batch = []

                    # get Q values for next_state
                    Qs_next_state = sess.run(
                        self.DQNetwork.output,
                        feed_dict={self.DQNetwork.inputs_: next_states_mb})

                    # set Q_target = r if the episode ends at s+1
                    for i in range(len(batch)):
                        terminal = dones_mb[i]

                        # if we are in a terminal state, the target is just the reward
                        if terminal:
                            target_Qs_batch.append(rewards_mb[i])
                        else:
                            target = rewards_mb[i] + gamma * np.max(
                                Qs_next_state[i])
                            target_Qs_batch.append(target)

                    targets_mb = np.array([each for each in target_Qs_batch])

                    loss, _ = sess.run(
                        [self.DQNetwork.loss, self.DQNetwork.optimizer],
                        feed_dict={
                            self.DQNetwork.inputs_: states_mb,
                            self.DQNetwork.target_Q: targets_mb,
                            self.DQNetwork.actions_: actions_mb
                        })

                    # write tf summaries
                    summary = sess.run(self.write_op,
                                       feed_dict={
                                           self.DQNetwork.inputs_: states_mb,
                                           self.DQNetwork.target_Q: targets_mb,
                                           self.DQNetwork.actions_: actions_mb
                                       })
                    self.writer.add_summary(summary, episode)
                    self.writer.flush()

                # save model every 5 episodes
                if episode % 5 == 0:
                    self.saver.save(sess,
                                    "models/{0}.cpkt".format(self.level_name))
                    print("Model Saved")
Example 22
class MarioEnv(Process):
    def __init__(self,
                 env_id,
                 idx,
                 child_conn,
                 queue,
                 s_dim,
                 a_dim,
                 g_net,
                 g_opt,
                 update_iter=10,
                 is_render=False,
                 use_cuda=False):
        super(MarioEnv, self).__init__()

        self.idx = idx
        self.env_id = env_id

        self.child_conn = child_conn
        self.queue = queue
        self.is_render = is_render
        # self.n_step = n_step
        self.update_iter = update_iter
        self.steps = 0
        self.episodes = 0
        self.accum_reward = 0
        self.transition = []

        self.use_cuda = use_cuda
        self.device = torch.device("cuda:0" if use_cuda else "cpu")

        self.s_dim = s_dim
        self.a_dim = a_dim
        self.g_net = g_net
        self.g_opt = g_opt

        self.buffer_state = []
        self.buffer_action = []
        self.buffer_reward = []

    def run(self):
        super(MarioEnv, self).run()

        self.model = A3C(
            self.s_dim,
            self.a_dim,
            gamma=0.95,
            epsilon_start=1.0,
            epsilon_end=0.1,
            epsilon_length=100000,
            use_cuda=self.use_cuda,
        )
        self.model.l_net.load_state_dict(self.g_net.state_dict())

        self.env = gym_super_mario_bros.make(self.env_id)
        self.env = BinarySpaceToDiscreteSpaceEnv(self.env, SIMPLE_MOVEMENT)
        self.reset()
        print('[ Worker %2d ] ' % (self.idx), end='')
        print('Playing <', self.env_id, '>')

        while True:
            if len(self.transition) != 4:
                action = self.model.get_action(self.transition, is_random=True)
            else:
                action = self.model.get_action(self.transition,
                                               is_random=False)

            next_state, reward, done, info = self.env.step(action)
            self.steps += 1
            self.accum_reward += reward
            next_state = rgb2dataset(next_state)

            if self.is_render and self.idx == 0:
                self.env.render()

            self.buffer_state.append(self.transition)
            self.buffer_action.append(action)
            self.buffer_reward.append(reward)

            if self.buffer_state and self.steps % self.update_iter == 0:
                next_transition = self.transition[1:]
                next_transition.append(next_state)

                self.train(next_transition, done)

                self.buffer_state.clear()
                self.buffer_action.clear()
                self.buffer_reward.clear()

            # make a transition
            self.transition.append(next_state)
            if len(self.transition) > 4:
                self.transition.pop(0)

            if done:
                self.send_result(info['x_pos'])
                self.reset()

    def reset(self):
        state = self.env.reset()
        state = rgb2dataset(state)
        self.transition.clear()
        self.transition.append(state)

        self.steps = 0
        self.episodes += 1
        self.accum_reward = 0

    def send_result(self, x_pos):
        self.queue.put([
            self.idx, "Result",
            [self.episodes, self.steps, self.accum_reward, x_pos]
        ])

    def train(self, next_transition, done):
        if done:
            v_s_ = 0.
        else:
            _, v = self.model.l_net.forward(
                torch.Tensor([next_transition]).to(self.device))
            v_s_ = v.cpu().detach().numpy()[0][0]

        prob, v = self.model.l_net.forward(
            torch.Tensor(self.buffer_state).to(self.device))

        buffer_v_target = []
        for r in self.buffer_reward[::-1]:
            v_s_ = r + self.model.gamma * v_s_
            buffer_v_target.append(v_s_)
        buffer_v_target.reverse()
        buffer_v_target = torch.Tensor(np.array(buffer_v_target)).to(
            self.device)
        buffer_action = torch.Tensor(np.array(self.buffer_action)).to(
            self.device)

        # Build the actor-critic loss
        td_error = buffer_v_target - v
        loss_critic = td_error.pow(2)

        dist = torch.distributions.Categorical(prob)
        loss_actor = -dist.log_prob(buffer_action) * td_error.detach()

        loss = (loss_critic + loss_actor).mean()

        self.g_opt.zero_grad()
        loss.backward()
        for lp, gp in zip(self.model.l_net.parameters(),
                          self.g_net.parameters()):
            gp._grad = lp.grad.clone().cpu()
        self.g_opt.step()

        self.model.l_net.load_state_dict(self.g_net.state_dict())
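
The reversed loop over buffer_reward in train() computes n-step bootstrapped returns R_t = r_t + gamma * R_{t+1}, seeded with V(s_T) (or 0 when the episode ended). A standalone sketch of just that computation (a hypothetical helper, not part of the class above):

def n_step_returns(rewards, bootstrap_value, gamma=0.95):
    """Discounted returns computed backwards from the bootstrap value."""
    returns = []
    value = bootstrap_value
    for r in reversed(rewards):
        value = r + gamma * value
        returns.append(value)
    returns.reverse()
    return returns

# n_step_returns([1.0, 0.0, -1.0], bootstrap_value=2.0, gamma=0.95)
# -> [1.81225, 0.855, 0.9]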
Esempio n. 23
0
class MarioEnvironment(Process):
    def __init__(self,
                 env_id,
                 is_render,
                 env_idx,
                 child_conn,
                 history_size=4,
                 h=84,
                 w=84):
        super(MarioEnvironment, self).__init__()
        self.daemon = True
        self.env = BinarySpaceToDiscreteSpaceEnv(
            gym_super_mario_bros.make(env_id), movement)

        self.is_render = is_render
        self.env_idx = env_idx
        self.steps = 0
        self.episode = 0
        self.rall = 0
        self.recent_rlist = deque(maxlen=100)
        self.child_conn = child_conn

        self.history_size = history_size
        self.history = np.zeros([history_size, h, w])
        self.h = h
        self.w = w

        self.reset()

    def run(self):
        super(MarioEnvironment, self).run()
        while True:
            action = self.child_conn.recv()
            if self.is_render:
                self.env.render()
            obs, reward, done, info = self.env.step(action)

            if life_done:
                # when Mario loses life, changes the state to the terminal
                # state.
                if self.lives > info['life'] and info['life'] > 0:
                    force_done = True
                    self.lives = info['life']
                else:
                    force_done = done
                    self.lives = info['life']
            else:
                # normal terminal state
                force_done = done

            # reward range -15 ~ 15
            log_reward = reward / 15
            self.rall += log_reward

            r = log_reward

            self.history[:3, :, :] = self.history[1:, :, :]
            self.history[3, :, :] = self.pre_proc(obs)

            self.steps += 1

            if done:
                self.recent_rlist.append(self.rall)
                print(
                    "[Episode {}({})] Step: {}  Reward: {}  Recent Reward: {}  Stage: {} current x:{}   max x:{}"
                    .format(self.episode, self.env_idx, self.steps, self.rall,
                            np.mean(self.recent_rlist), info['stage'],
                            info['x_pos'], self.max_pos))

                self.history = self.reset()

            # reply to the parent on every step, including the terminal one
            self.child_conn.send(
                [self.history[:, :, :], r, force_done, done, log_reward])

    def reset(self):
        self.steps = 0
        self.episode += 1
        self.rall = 0
        self.lives = 3
        self.stage = 1
        self.max_pos = 0
        self.get_init_state(self.env.reset())
        return self.history[:, :, :]

    def pre_proc(self, X):
        # grayscaling
        x = cv2.cvtColor(X, cv2.COLOR_RGB2GRAY)
        # resize
        x = cv2.resize(x, (self.h, self.w))
        x = np.float32(x) * (1.0 / 255.0)

        return x

    def get_init_state(self, s):
        for i in range(self.history_size):
            self.history[i, :, :] = self.pre_proc(s)
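
MarioEnvironment blocks on child_conn.recv() for an action and answers each step with [history, reward, force_done, done, log_reward]. A minimal parent-side driver might look like the sketch below, assuming the module-level names the class relies on (movement, life_done) are defined and that one worker is enough for illustration; the constant action 1 is only a placeholder for a real policy:

from multiprocessing import Pipe

parent_conn, child_conn = Pipe()
worker = MarioEnvironment('SuperMarioBros-v0', is_render=False,
                          env_idx=0, child_conn=child_conn)
worker.start()

for _ in range(100):
    parent_conn.send(1)  # action index chosen by the learner
    history, reward, force_done, done, log_reward = parent_conn.recv()
    # hand the stacked 4x84x84 history to the learner here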
Esempio n. 24
0
def main():
    # copy SIMPLE_MOVEMENT so the shared action list is not mutated in place
    movement = list(SIMPLE_MOVEMENT)
    movement.append(['left', 'A'])
    movement.append(['left', 'B'])
    movement.append(['left', 'A', 'B'])
    #movement.append(['B'])
    #movement.append(['down'])
    #movement.append(['up'])

    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, movement)

    # channels acts as the number of frames kept in history.
    # If resize_height and final_height differ, the frame is first resized to
    # resize_height and then cropped to final_height (final_height < resize_height).
    channels = 4
    # width = 84
    # resize_height = 110
    # final_height = 84
    width = 128
    resize_height = 168
    final_height = 128
    size = [channels, final_height, width]

    batch_size = 16
    replay_capacity = 100000
    replay_dir = '/home/hansencb/mario_replay/'

    gamma = 0.95

    start_epsilon = 0.3
    stop_epsilon = 0.01
    epsilon_decay = 0.00025

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(1)
    device = torch.device("cuda" if use_cuda else "cpu")

    model = simple_net(channels, len(movement), device).to(device)
    target_model = simple_net(channels, len(movement), device).to(device)

    data_file = 'data_loader'
    model_file = 'mario_agent'
    continue_train = True

    if continue_train:
        # resume training: load saved weights into both the online and target networks
        model.load_state_dict(torch.load(model_file))
        target_model.load_state_dict(torch.load(model_file))

    lr = 0.00005
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    total_reward_file = 'total_reward.txt'


    if not continue_train:
        with open(total_reward_file, 'w') as f:
            f.write('Reward\tSteps\n')


    max_steps = 5000
    num_eps = 5000

    if continue_train:
        with open(data_file, 'rb') as f:
            data = pickle.load(f)
            data.batch_size = batch_size
    else:
        data = dataset(replay_capacity, batch_size, replay_dir, size)

        #initialize memory with 100 experiences
        done = True
        for i in range(100):
            if done:
                state = env.reset()
                state = preprocess(state, [resize_height, width], final_height)
                state = torch.cat((state, state, state, state))

            action = random.randint(0,len(movement)-1)
            next_state, reward, done, info = env.step(int(action))

            # if reward>0:
            #     reward = 1
            # else:
            #     reward = -1
            reward /= 15
            if reward == 0:
                reward = -0.1

            next_state = preprocess(next_state, [resize_height, width], final_height)
            next_state = torch.cat((state[1:, :, :], next_state))

            trans = transition(state, action, reward, next_state, done)
            data.add(trans)

            state = next_state


    tau = 0
    max_tau = 2000
    decay_step = 0
    farthest = 3000
    cur_x = 1

    #training loop
    for episode in range(num_eps):
        print('Episode {}'.format(episode+1))
        state = env.reset()
        state = preprocess(state, [resize_height, width], final_height)
        state = torch.cat((state, state, state, state))
        action = 0

        episode_reward = 0

        for step in range(max_steps):
            tau += 1


            #epsilon = stop_epsilon+(start_epsilon - stop_epsilon)*np.exp(-epsilon_decay*decay_step)
            epsilon = start_epsilon * np.exp(1-(1/(cur_x/farthest)))
            if epsilon < stop_epsilon:
                epsilon = stop_epsilon

            if random.random() < epsilon:
                action = random.randint(0,len(movement)-1)
            else:
                q_val, action, q_vals = maxQ(state, model, device)

            next_state, reward, done, info = env.step(int(action))

            cur_x = info['x_pos']

            if cur_x > farthest:
                farthest = cur_x

            # if reward > 0:
            #     reward = 1
            # else:
            #     reward = -1

            reward /= 15
            if reward == 0:
                reward = -0.1

            episode_reward += reward

            next_state = preprocess(next_state, [resize_height, width], final_height)
            next_state = torch.cat((state[1:,:,:], next_state))

            trans = transition(state, action, reward, next_state, done)
            data.add(trans)
            batch = data.get_batch(model, target_model, device, gamma)
            loss, abs_err = train(model, device, optimizer, batch)

            data.update_batch(batch['idx'], np.squeeze(abs_err.numpy()))

            state = next_state

            env.render()
            #time.sleep(0.03)

            if tau > max_tau:
                target_model.load_state_dict(model.state_dict())
                tau = 0

            if done:
                break

        decay_step += step
        with open(total_reward_file, 'a') as f:
            f.write('{}\t{}\n'.format(episode_reward, step))

        if episode % 5 == 0:
            with open(model_file, 'wb') as f:
                torch.save(model.state_dict(), f)
            with open(data_file, 'wb') as f:
                pickle.dump(data, f)


    env.close()
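
The exploration schedule in this loop is tied to progress rather than time: epsilon = start_epsilon * exp(1 - farthest/cur_x), so exploration stays near start_epsilon when Mario is close to his best x-position and collapses to stop_epsilon far behind it. A small standalone sketch of the schedule (sample values are illustrative only):

import numpy as np

def progress_epsilon(cur_x, farthest, start_eps=0.3, stop_eps=0.01):
    """Exploration rate that is high near the agent's frontier
    (cur_x close to farthest) and low in well-explored territory."""
    eps = start_eps * np.exp(1.0 - 1.0 / (cur_x / farthest))
    return max(float(eps), stop_eps)

for x in (100, 1000, 2000, 3000):
    print(x, round(progress_epsilon(x, farthest=3000), 4))
# 100 -> 0.01 (clamped), 1000 -> 0.0406, 2000 -> 0.182, 3000 -> 0.3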
Esempio n. 25
0
movements = [
    ['A'],
    ['B'],
    ['right'],
    ['right', 'A'],
    ['right', 'B'],
    ['right', 'A', 'B'],
    ['left'],
    ['left', 'A'],
    ['left', 'B'],
    ['left', 'A', 'B'],
    #    ['down'],
    #    ['up']
]

_env = gym_super_mario_bros.make('SuperMarioBros-v0')
#_env = gym_super_mario_bros.SuperMarioBrosEnv(frames_per_step=1, rom_mode='rectangle')
env = BinarySpaceToDiscreteSpaceEnv(_env, movements)
env = DummyVecEnv([lambda: env])
model = PPO2(policy=CnnPolicy, env=env, verbose=1)
model.learn(total_timesteps=10000)

obs = env.reset()

while True:
    action, _info = model.predict(obs)

    obs, rewards, dones, info = env.step(action)
    print("학습끝")
    print(rewards)
    env.render()
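
After model.learn() the trained policy only exists in memory. A hedged sketch of persisting it with stable-baselines and reloading it for a greedy rollout (the file name "ppo2_mario" is arbitrary):

model.save("ppo2_mario")

# later, after rebuilding the wrapped env exactly as above:
loaded_model = PPO2.load("ppo2_mario")

obs = env.reset()
for _ in range(1000):
    action, _states = loaded_model.predict(obs, deterministic=True)
    obs, rewards, dones, info = env.step(action)
    env.render()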
Esempio n. 26
0
    def run(self):
        global episode
        env = gym_super_mario_bros.make('SuperMarioBros-1-1-v3')
        env = BinarySpaceToDiscreteSpaceEnv(env, REALLY_COMPLEX_MOVEMENT)
        step = 0

        while episode < EPISODES:
            done = False

            max_x = 40
            no_progress = 0
            score = 0
            state = env.reset()

            # Build an initial frame history by stepping with NOOP (action 0)
            for _ in range(5):
                next_state = state
                state, _, _, _ = env.step(0)

            state = preprocess(state)
            history = np.stack((state, state, state, state), axis=2)
            history = np.reshape([history], (1, 88, 128, 4))

            while not done:
                # Rendering code
                # Seems to be causing error in Mac OS
                #if self.thread_num==1:
                #    env.render()
                step += 1
                self.t += 1

                step_reward = 0

                action, policy = self.get_action(history)

                # Repeat the selected action for 6 frames (manual frame skip)
                for _ in range(6):
                    next_state, reward, done, info = env.step(action)
                    score += reward
                    step_reward += reward
                    if done:
                        break

                # End the episode if Mario makes no progress for 150 decision steps
                x_now = info.get('x_pos')
                # x_pos occasionally wraps around to 65535; ignore that reading
                if x_now == 65535:
                    x_now = max_x
                if max_x < x_now:
                    max_x = x_now
                    no_progress = 0
                else:
                    no_progress += 1
                if no_progress == 150:
                    done = True
                    #reward -= 1
                    step_reward -= 1
                    score -= 1
                    print("#", self.thread_num, " STUCK")

                # Preprocessing each states
                next_state = preprocess(next_state)
                next_state = np.reshape([next_state], (1, 88, 128, 1))
                next_history = np.append(next_state,
                                         history[:, :, :, :3],
                                         axis=3)

                # Average policy max value
                self.avg_p_max += np.amax(
                    self.actor.predict(np.float32(history / 255.)))

                # Appending sample
                self.append_sample(history, action, step_reward)
                history = next_history
                if self.t >= self.t_max or done:
                    #if done:
                    self.train_model(done)
                    self.update_local_model()
                    self.t = 0

                if done:
                    # Recording training information

                    episode += 1
                    print("#", self.thread_num,
                          "  episode:", episode, "  score:",
                          format(score,
                                 '.2f'), "  step:", step, "max_x :", max_x)

                    stats = [score, self.avg_p_max / float(step), step]
                    for i in range(len(stats)):
                        self.sess.run(self.update_ops[i],
                                      feed_dict={
                                          self.summary_placeholders[i]:
                                          float(stats[i])
                                      })
                    summary_str = self.sess.run(self.summary_op)
                    self.summary_writer.add_summary(summary_str, episode + 1)
                    self.avg_p_max = 0
                    self.avg_loss = 0
                    step = 0
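
The stuck-detection logic above (best-x tracking, the 65535 wrap-around guard, and the 150-step patience) can be factored into a small helper; the class below is a hypothetical refactoring sketch, not code from the worker above:

class StallDetector:
    """Signals that an episode should end when Mario's best x-position
    has not improved for `patience` consecutive decision steps."""

    def __init__(self, start_x=40, patience=150):
        self.max_x = start_x
        self.patience = patience
        self.no_progress = 0

    def update(self, x_now):
        if x_now == 65535:        # wrap-around glitch: ignore the reading
            x_now = self.max_x
        if x_now > self.max_x:
            self.max_x = x_now
            self.no_progress = 0
        else:
            self.no_progress += 1
        return self.no_progress >= self.patience  # True -> force episode end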
Esempio n. 27
0
        x = F.relu(self.bn3(self.conv3(x)))
        return self.head(x.view(x.size(0), -1))


BATCH_SIZE = 16
GAMMA = 0.01
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
TARGET_UPDATE = 5

# Get screen size so that we can initialize layers correctly based on shape
# returned from AI gym. Typical dimensions at this point are close to 3x40x90
# which is the result of a clamped and down-scaled render buffer in get_screen()
done = True
picture = env.reset()
init_screen = picture
screen_height, screen_width, _ = init_screen.shape

policy_net = DQN(screen_height, screen_width).to(device)
target_net = DQN(screen_height, screen_width).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.RMSprop(policy_net.parameters())
memory = ReplayMemory(2000)

steps_done = 0
steps_list = {}
def select_action(state,x):
    global steps_list
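
The example is cut off inside select_action. With the EPS_START/EPS_END/EPS_DECAY constants defined above, a typical epsilon-greedy selector over policy_net (a sketch of the standard pattern, not the author's missing body) would look like:

import math
import random
import torch

def select_action_sketch(state, n_actions):
    """Epsilon-greedy action selection with exponentially decaying epsilon;
    `state` is assumed to be a batched tensor accepted by policy_net."""
    global steps_done
    eps = EPS_END + (EPS_START - EPS_END) * math.exp(-steps_done / EPS_DECAY)
    steps_done += 1
    if random.random() > eps:
        with torch.no_grad():
            # greedy: index of the highest predicted Q-value
            return policy_net(state).max(1)[1].view(1, 1)
    return torch.tensor([[random.randrange(n_actions)]],
                        device=device, dtype=torch.long)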
Esempio n. 28
0
"""
Created on Sun Mar 10 21:00:57 2019

@author: tawehbeysolow
"""

import numpy as np
from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
from algorithms.actor_critic_utilities import train_model
from neural_networks.models import ActorCriticModel

#Parameters
environment = gym_super_mario_bros.make('SuperMarioBros-v0')
environment = BinarySpaceToDiscreteSpaceEnv(environment, SIMPLE_MOVEMENT)
observation = environment.reset()
learning_rate = 1e-4
gamma = 0.96
epsilon = 0.9
n_episodes = 10000
n_steps = 2048
max_steps = int(1e7)
_lambda = 0.95
value_coefficient = 0.5
entropy_coefficient = 0.01
max_grad_norm = 0.5
log_interval = 10


def play_super_mario(model, environment=environment):
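
The snippet is truncated at play_super_mario and the ActorCriticModel interface is not shown, so the following is only a generic rollout sketch that accepts any observation-to-action callable rather than the author's model API:

def rollout(policy_fn, environment=environment, max_steps=5000, render=False):
    """Run one episode and return the total (undiscounted) reward;
    policy_fn maps an observation to a discrete action index."""
    observation = environment.reset()
    total_reward = 0.0
    for _ in range(max_steps):
        action = policy_fn(observation)
        observation, reward, done, info = environment.step(action)
        total_reward += reward
        if render:
            environment.render()
        if done:
            break
    return total_reward

# e.g. a random baseline:
# rollout(lambda obs: environment.action_space.sample())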
Esempio n. 29
0
class MarioEnv(Process):
    def __init__(self,
                 env_id,
                 idx,
                 child_conn,
                 queue,
                 n_step,
                 is_render=False):
        super(MarioEnv, self).__init__()

        self.idx = idx
        self.env_id = env_id

        self.child_conn = child_conn
        self.queue = queue
        self.is_render = is_render
        self.n_step = n_step
        self.steps = 0
        self.episodes = 0
        self.accum_reward = 0
        self.transition = []

    def run(self):
        super(MarioEnv, self).run()

        self.env = gym_super_mario_bros.make(self.env_id)
        self.env = BinarySpaceToDiscreteSpaceEnv(self.env, SIMPLE_MOVEMENT)
        self.reset()
        print('[ Worker %2d ] ' % (self.idx), end='')
        print('Playing <', self.env_id, '>')

        self.request_action(0, False)

        while True:
            action = self.child_conn.recv()
            next_state, reward, done, info = self.env.step(action)
            self.steps += 1
            self.accum_reward += reward
            next_state = rgb2dataset(next_state)

            if self.is_render and self.idx == 0:
                self.env.render()

            # make a transition
            self.transition.append(next_state)
            if len(self.transition) > 4:
                self.transition.pop(0)

            if done:
                self.send_result(info['x_pos'])
                self.reset()
                self.request_action(reward, True)
            else:
                self.request_action(reward, False)

    def reset(self):
        state = self.env.reset()
        state = rgb2dataset(state)
        self.transition.clear()
        self.transition.append(state)

        self.steps = 0
        self.episodes += 1
        self.accum_reward = 0

    def request_action(self, reward, done):
        self.queue.put([self.idx, "OnStep", [self.transition, reward, done]])

    def send_result(self, x_pos):
        self.queue.put([
            self.idx, "Result",
            [self.episodes, self.steps, self.accum_reward, x_pos]
        ])
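
This worker reports through a shared queue ("OnStep" to request an action, "Result" at the end of an episode) and receives actions on its pipe. A minimal parent-side sketch of that protocol, assuming a single worker, that rgb2dataset and SIMPLE_MOVEMENT are importable as the worker expects, and with random actions standing in for a real policy:

import random
from multiprocessing import Pipe, Queue

parent_conn, child_conn = Pipe()
queue = Queue()
worker = MarioEnv('SuperMarioBros-v0', idx=0, child_conn=child_conn,
                  queue=queue, n_step=5, is_render=False)
worker.start()

while True:
    idx, msg, payload = queue.get()
    if msg == "OnStep":
        transition, reward, done = payload
        parent_conn.send(random.randint(0, 6))  # a learner would choose here
    elif msg == "Result":
        episodes, steps, accum_reward, x_pos = payload
        print("worker", idx, "episode", episodes, "x_pos", x_pos)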
Esempio n. 30
0
env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = BinarySpaceToDiscreteSpaceEnv(env, COMPLEX_MOVEMENT)
done = True

RENDER_ENV = False

EPISODES = 5000
rewards = []
PG = PolicyGradient(n_x=env.observation_space.shape[0],
                    n_y=env.action_space.n,
                    learning_rate=0.01,
                    reward_decay=0.99)

for episodes in range(EPISODES):

    observation = env.reset()

    observation = np.array(observation).reshape(1, 240, 256, 3)

    episode_reward = 0
    print("episode", episodes)

    while True:

        if RENDER_ENV: env.render()

        action = PG.choose_action(observation)

        next_state, reward, done, info = env.step(action)

        PG.store_transition(next_state, action, reward)