Example #1
def main():
    #env = gym_super_mario_bros.make('SuperMarioBros-v0')
    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v1')
    env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
    timestart = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d-%H:%M:%S')
    # env = VideoRecorderWrapper(env, PROJ_DIR + "/../video", str(timestart), 50)

    env = VideoRecorderWrapper(env, PROJ_DIR + "/../video/final", str(timestart), 1)
    env = DownsampleEnv(env, (84, 84))
    env = PenalizeDeathEnv(env, penalty=-25)
    env = FrameStackEnv(env, 4)
    # good
    #act = deepq.load(PROJ_DIR+"/../models/mario_model_2018-08-12-13:00:58.pkl")

    # better
    act = deepq.load(PROJ_DIR + "/../models/mario_model_2018-08-12-19:21:50.pkl")
    
    episode = 0
    while True:
        obs, done = env.reset(), False
        stepnr = 0
        episode_rew = 0
        while not done:
            env.render()
            obs, rew, done, _ = env.step(act(obs[None])[0])

            if stepnr % 20 == 0:
                plot_obs(obs)

            episode_rew += rew
            stepnr += 1
        print("Episode reward", episode_rew, episode)
        episode = episode+1
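The plot_obs helper called every 20 steps is not included in this snippet; a minimal sketch, assuming it just displays the four stacked 84x84 grayscale frames produced by DownsampleEnv and FrameStackEnv:

import matplotlib.pyplot as plt
import numpy as np

# Hypothetical helper, not part of the original example: show the four stacked
# 84x84 grayscale frames side by side.
def plot_obs(obs):
    frames = np.asarray(obs)  # frame stack -> (84, 84, 4), assumed layout
    fig, axes = plt.subplots(1, frames.shape[-1], figsize=(12, 3))
    for i, ax in enumerate(axes):
        ax.imshow(frames[..., i], cmap='gray')
        ax.axis('off')
    plt.show()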
Example #2
	def run(self, solution, level, render, mode):
		env = gym_super_mario_bros.make(level)
		env = BinarySpaceToDiscreteSpaceEnv(env, COMPLEX_MOVEMENT)

		done = True
		reason_finish = "no_more_commands"

		pos = 0
		total_r = 0

		for step in range(len(solution)):
			if done:
				state = env.reset()

			state, reward, done, info = env.step(solution[pos])
			pos+=1


			if reward == -15:  # died
				reason_finish = "death"
				break

			if mode == "level" and info['flag_get'] == True:
				reason_finish = "win"
				break

			total_r = total_r + reward
			if render == "true":
				env.render()


		env.close()
		return total_r, pos, info, reason_finish
Example #3
def main(path="./models/deepq/mario_reward_1736.7.pkl"):
    step_mul = 16
    steps = 200

    FLAGS = flags.FLAGS
    flags.DEFINE_string("env", "SuperMarioBros-v0", "RL environment to train.")
    flags.DEFINE_string("algorithm", "deepq", "RL algorithm to use.")

    FLAGS(sys.argv)
    # 1. Create gym environment
    env = gym_super_mario_bros.make('SuperMarioBros-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)

    act = deepq.load(path)
    nstack = 4
    nh, nw, nc = env.observation_space.shape
    history = np.zeros((1, nh, nw, nc * nstack), dtype=np.uint8)

    obs, done = env.reset(), False
    # history = update_history(history, obs)
    episode_rew = 0
    while not done:
        env.render()
        action = act([obs])[0]
        obs, rew, done, _ = env.step(action)
        # history = update_history(history, obs)
        episode_rew += rew
        print("action : %s reward : %s" % (action, rew))

    print("Episode reward", episode_rew)
Example #4
class MarioBrosEnvironment(AbstractEnvironment):
    def __init__(self, config):

        self.config = config
        if config.env == 'mario':
            from gym_super_mario_bros.actions import RIGHT_ONLY
            from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv
            import gym_super_mario_bros
            env = gym_super_mario_bros.make('SuperMarioBros-v0')
            self.env = BinarySpaceToDiscreteSpaceEnv(env, RIGHT_ONLY)
        elif config.env == 'montezuma':
            self.env = gym.make('MontezumaRevengeDeterministic-v0')
        self.state_buffer = []
        self.reward_buffer = []
        self.counter = 0

    def process_image(self, s, subsample=4):
        if self.config.env == 'mario':
            s = skimage.color.rgb2gray(s)
            s = skimage.transform.resize(
                s, (s.shape[0] // subsample, s.shape[1] // subsample),
                anti_aliasing=True,
                mode='constant')
            s = torch.from_numpy(s)
        elif self.config.env == 'montezuma':
            s = s[34:34 + 160, :160]
            s = skimage.color.rgb2gray(s)
            s = skimage.transform.resize(
                s, (s.shape[0] // subsample, s.shape[1] // subsample),
                anti_aliasing=True,
                mode='constant')
            s = torch.from_numpy(s).float()
        return s

    def reset(self):
        self.counter = 0
        self.state_buffer = []
        for _ in range(self.config.n_input_frames):
            state = self.process_image(self.env.reset(),
                                       self.config.image_subsample)
            self.state_buffer.append(state)

        return torch.stack(self.state_buffer)

    def step(self, action):
        total_reward = 0
        for _ in range(self.config.n_action_repeat):
            state, reward, done, info = self.env.step(action)
            total_reward += reward
            if done:
                break
        state = self.process_image(state, self.config.image_subsample)
        self.state_buffer.append(state)
        self.state_buffer = self.state_buffer[-self.config.n_input_frames:]
        return torch.stack(self.state_buffer), total_reward, done, info
Example #5
class Environment:

    actionMap = {
        0: 'NOOP',
        1: 'Right',
        2: 'Right-Jump',
        3: 'Right-Sprint',
        4: 'Right-Jump-Sprint',
        5: 'Jump',
        6: 'Left'
    }

    def __init__(self, rows=19, columns=16, verbose=True, raw=True, variant=1):
        self.verbose = verbose
        self.raw = raw
        self.variant = variant
        self.img2state = Img2State(rows=rows, columns=columns)
        self.game = BinarySpaceToDiscreteSpaceEnv(
            gym_super_mario_bros.make('SuperMarioBros-v3'), SIMPLE_MOVEMENT)
        self.state = self.img2state.transfrom(self.game.reset(),
                                              raw=self.raw,
                                              variant=self.variant)
        self.reward = 0

        # Actions
        self.A = list(Environment.actionMap.keys())

    def step(self, action: int):
        if action not in self.A:
            raise Exception('Wrong Action...')

        state, self.reward, done, info = self.game.step(action)
        self.state = self.img2state.transfrom(state,
                                              raw=self.raw,
                                              variant=self.variant)

        if done and self.state[8]:
            self.reward = 100
        elif self.state[8]:
            self.reward = 30
        elif self.state[9]:
            self.reward = 15

        if self.verbose:
            self.game.render()

        return done

    def reset(self):
        self.state = self.img2state.transfrom(self.game.reset(),
                                              raw=self.raw,
                                              variant=self.variant)
        self.reward = 0
Example #6
class Environment(threading.Thread):
    stop_signal = False

    def __init__(self,
                 render=False,
                 eps_start=EPS_START,
                 eps_end=EPS_STOP,
                 eps_steps=EPS_STEPS):
        threading.Thread.__init__(self)
        self.render = render

        # Make the super mario gym environment and apply wrappers
        self.env = gym.make(ENV)
        self.env = BinarySpaceToDiscreteSpaceEnv(self.env, SIMPLE_MOVEMENT)
        self.env = preprocess.GrayScaleImage(self.env,
                                             height=HIGHT,
                                             width=WIDTH,
                                             grayscale=True)
        # self.env = wrappers.Monitor(self.env, "./Super_Mario_AI/videos", force = True, write_upon_reset=True)
        self.agent = Agent(TEMPERATURE)

    def runEpisode(self):
        s = self.env.reset()
        R = 0
        while True:
            time.sleep(THREAD_DELAY)  # yield

            if self.render: self.env.render()

            a = self.agent.act(s)
            s_, r, done, info = self.env.step(a)

            if done:  # terminal state
                s_ = None

            self.agent.train(s, a, r, s_)

            s = s_
            R += r

            if done or self.stop_signal:
                break

        print("Total R:", R)

    def run(self):
        while not self.stop_signal:
            self.runEpisode()

    def stop(self):
        self.stop_signal = True
Example #7
class MarioEnv:
    def __init__(self, os='mac', display=False):
        self.display = display
        if os == 'mac' or os == 'linux':
            env = gym_super_mario_bros.make('SuperMarioBros-v0')
            self.env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
        else:
            raise Exception("bad os")
        self.act_dim = self.env.action_space.n
        self.obs_dim = (1, 128, 128)
        print("env created with act_dim", self.act_dim, "obs_dim",
              self.obs_dim)
        self.transform = transforms.Compose([
            transforms.ToTensor(),  # chain 2 transforms together using list.
            transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
        ])

    def reset(self):
        state = self.env.reset()
        return self.__resize_image(state)

    def step(self, action):
        state, reward, done, info = self.env.step(action)
        if reward == 0:
            reward = -0.5
        state_t = self.__resize_image(state)
        return state_t, \
               np.reshape(reward, -1), \
               np.reshape(done, -1)

    def close(self):
        self.env.close()

    def __resize_image(self, state):
        state_new = cv2.resize(state, (128, 128))
        img = Image.fromarray(state_new)
        state_t = self.transform(img)[0, :, :].unsqueeze(0)
        state_t = state_t.float().to(DEVICE)
        return state_t.unsqueeze(0)

    def render(self):
        if self.display:
            self.env.render()
Example #8
def main():
    env = gym_super_mario_bros.make('SuperMarioBros-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)

    done = True
    max_step = 5000
    print(env.observation_space.shape)
    # On Windows, pass ascii=True so tqdm does not print a new line on every update
    qbar = tqdm(total=max_step, ascii=True)
    for step in range(max_step):
        qbar.update()
        if done:
            state = env.reset()
        action = get_action(state, env.action_space)
        state, reward, done, info = env.step(action)
        if done:
            print(str(step) + " Hero, try again! " + str(info))
        env.render()
    env.close()
    qbar.close()
Example #9
def show_input(input, wait=False):
    # Reconstructed: the opening lines of this helper were truncated in the
    # source; cv2.imshow is an assumed display call around the original resize.
    cv2.imshow("input",
               cv2.resize(input, (600, 320), interpolation=cv2.INTER_NEAREST))
    if wait:
        cv2.waitKey(4)


def build_model(input_shape, actions):
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dense(256, input_dim=4, activation='relu'))
    model.add(tf.keras.layers.Dense(64, activation='relu'))
    model.add(tf.keras.layers.Dense(actions, activation="softmax"))
    return model


env = gym_super_mario_bros.make('SuperMarioBros-1-1-v2')
env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
state = env.reset()
print(f"State shape: {state.shape}")
input = state_to_tf_input(state)
print(f"Input shape: {input.shape}")
build_model(input.shape, env.action_space.n)

done = False
step = 0
while not done and step < 5000:
    step += 1
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)
    input = state_to_tf_input(state)
    print(f"{step}: {action} -> {reward}")
    show_input(input, True)
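state_to_tf_input is used above but its definition was not captured in this snippet; a minimal sketch, assuming it only grayscales and normalizes the raw RGB frame for display and model input:

import cv2
import numpy as np

# Hypothetical helper, not part of the original snippet: convert the raw
# (240, 256, 3) RGB frame into a normalized grayscale float array.
def state_to_tf_input(state):
    gray = cv2.cvtColor(state, cv2.COLOR_RGB2GRAY)
    return gray.astype(np.float32) / 255.0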
Example #10
    myRL = DQNAgent(gamma=GAMMA, initial_epsilon=INITIAL_EPSILON,
                    final_epsilon=FINAL_EPSILON,
                    decay_epsilon=DECAY_EPSILON,
                    lr=LEARNING_RATE)

    for episode in range(EPISODES):
        env = gym_super_mario_bros.make('SuperMarioBros-v0')
        env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
        done = True
        action = 0  # action 'NOOP'

        for itera in range(ITERATION):
            if done:
                env.reset()
                oldObse, _, _, _ = env.step(0)  # get the initial state
                oldObse = myRL.pre_process(oldObse)
                # oldObse = (-1, 'NOOP', 0)

            action = myRL.chooseAction(oldObse)
            
            newObse, reward, done, info = env.step(action)
            newObse = myRL.pre_process(newObse)

            myRL.remember(state=oldObse, action=action, reward=reward, next_state=newObse, done=done)
            oldObse = newObse

            if len(myRL.memory) > BATCH:
                myRL.learn_from_replay(BATCH)
            env.render()
        env.close()
Example #11
class MarioEnvironment(Process):
    def __init__(self,
                 env_id,
                 is_render,
                 env_idx,
                 child_conn,
                 history_size=4,
                 h=84,
                 w=84):
        super(MarioEnvironment, self).__init__()
        self.daemon = True
        self.env = BinarySpaceToDiscreteSpaceEnv(
            gym_super_mario_bros.make(env_id), movement)

        self.is_render = is_render
        self.env_idx = env_idx
        self.steps = 0
        self.episode = 0
        self.rall = 0
        self.recent_rlist = deque(maxlen=100)
        self.child_conn = child_conn

        self.history_size = history_size
        self.history = np.zeros([history_size, h, w])
        self.h = h
        self.w = w

        self.reset()

    def run(self):
        super(MarioEnvironment, self).run()
        while True:
            action = self.child_conn.recv()
            if self.is_render:
                self.env.render()
            obs, reward, done, info = self.env.step(action)

            if life_done:
                # when Mario loses life, changes the state to the terminal
                # state.
                if self.lives > info['life'] and info['life'] > 0:
                    force_done = True
                    self.lives = info['life']
                else:
                    force_done = done
                    self.lives = info['life']
            else:
                # normal terminal state
                force_done = done

            # reward range -15 ~ 15
            log_reward = reward / 15
            self.rall += log_reward

            r = log_reward

            self.history[:3, :, :] = self.history[1:, :, :]
            self.history[3, :, :] = self.pre_proc(obs)

            self.steps += 1

            if done:
                self.recent_rlist.append(self.rall)
                print(
                    "[Episode {}({})] Step: {}  Reward: {}  Recent Reward: {}  Stage: {} current x:{}   max x:{}"
                    .format(self.episode, self.env_idx, self.steps, self.rall,
                            np.mean(self.recent_rlist), info['stage'],
                            info['x_pos'], self.max_pos))

                self.history = self.reset()
            else:
                self.child_conn.send(
                    [self.history[:, :, :], r, False, done, log_reward])

    def reset(self):
        self.steps = 0
        self.episode += 1
        self.rall = 0
        self.lives = 3
        self.stage = 1
        self.max_pos = 0
        self.get_init_state(self.env.reset())
        return self.history[:, :, :]

    def pre_proc(self, X):
        # grayscaling
        x = cv2.cvtColor(X, cv2.COLOR_RGB2GRAY)
        # resize
        x = cv2.resize(x, (self.h, self.w))
        x = np.float32(x) * (1.0 / 255.0)

        return x

    def get_init_state(self, s):
        for i in range(self.history_size):
            self.history[i, :, :] = self.pre_proc(s)
Example #12
class MarioEnv(Process):
    def __init__(self,
                 env_id,
                 idx,
                 child_conn,
                 queue,
                 n_step,
                 is_render=False):
        super(MarioEnv, self).__init__()

        self.idx = idx
        self.env_id = env_id

        self.child_conn = child_conn
        self.queue = queue
        self.is_render = is_render
        self.n_step = n_step
        self.steps = 0
        self.episodes = 0
        self.accum_reward = 0
        self.transition = []

    def run(self):
        super(MarioEnv, self).run()

        self.env = gym_super_mario_bros.make(self.env_id)
        self.env = BinarySpaceToDiscreteSpaceEnv(self.env, SIMPLE_MOVEMENT)
        self.reset()
        print('[ Worker %2d ] ' % (self.idx), end='')
        print('Playing <', self.env_id, '>')

        self.request_action(0, False)

        while True:
            action = self.child_conn.recv()
            next_state, reward, done, info = self.env.step(action)
            self.steps += 1
            self.accum_reward += reward
            next_state = rgb2dataset(next_state)

            if self.is_render and self.idx == 0:
                self.env.render()

            # make a transition
            self.transition.append(next_state)
            if len(self.transition) > 4:
                self.transition.pop(0)

            if done:
                self.send_result(info['x_pos'])
                self.reset()
                self.request_action(reward, True)
            else:
                self.request_action(reward, False)

    def reset(self):
        state = self.env.reset()
        state = rgb2dataset(state)
        self.transition.clear()
        self.transition.append(state)

        self.steps = 0
        self.episodes += 1
        self.accum_reward = 0

    def request_action(self, reward, done):
        self.queue.put([self.idx, "OnStep", [self.transition, reward, done]])

    def send_result(self, x_pos):
        self.queue.put([
            self.idx, "Result",
            [self.episodes, self.steps, self.accum_reward, x_pos]
        ])
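The rgb2dataset function used by the worker above is defined elsewhere in that project; a minimal sketch, assuming it does the usual grayscale-and-downsample preprocessing:

import cv2
import numpy as np

# Hypothetical preprocessing, not the project's actual rgb2dataset: grayscale,
# resize to 84x84 and scale pixel values to [0, 1].
def rgb2dataset(frame):
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    small = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA)
    return small.astype(np.float32) / 255.0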
Example #13
def replay_genome(genome, movements, gen):
    env_expanded = gym_super_mario_bros.SuperMarioBrosEnv(frames_per_step=1,
                                                          rom_mode='vanilla')
    env = BinarySpaceToDiscreteSpaceEnv(env_expanded, movements)

    print('Number of genes: ', len(genome.connection_genes))
    for gene in genome.connection_genes:
        print(gene.in_node, gene.out_node, gene.weight, gene.innovation_number,
              gene.type, gene.enabled)

    done = True
    unticked = 0
    tick_interval = 1 / 30
    last_tick_time = time.time()

    fps = 0
    frames = 0
    last_fps_time = time.time()

    for _ in range(500000):

        unticked += time.time() - last_tick_time
        last_tick_time = time.time()
        ticked = False

        # while unticked >= tick_interval:
        if done:
            state = env.reset()

        state_downscaled = get_sensor_map(env_expanded)

        action = genome.calculate_action(state_downscaled)

        # print('\rFPS: {:.3f}'.format(fps), end=' ')
        # print(vectofixedstr(action, 10), end=' ')
        action = np.argmax(action)
        print('\rtaking action', movements[action], end='', flush=True)

        state, reward, done, info = env.step(action)

        #filename = get_path_of('all_pictures/mario/')
        #imsave(filename + 'mario_' + str(_) + '.png', state)

        save_state = np.full((13, 10, 3), 255, dtype=int)

        COLORS = [[250, 250, 250], [0, 0, 0], [196, 0, 0], [0, 0, 196]]

        for i in range(13):
            for j in range(10):
                if state_downscaled[(i, j)] == -1:
                    save_state[(i, j)] = COLORS[3]
                elif state_downscaled[(i, j)] == 0:
                    save_state[(i, j)] = COLORS[0]
                else:
                    save_state[(i, j)] = COLORS[1]

        save_state[(7, 2)] = COLORS[2]

        # filename = get_path_of('all_pictures/input_downscaled/')
        # imsave(filename + 'state_' + str(_) + '.png', save_state.astype(np.uint8))

        # make_controller(movements[action], _, gen)

        env.render()

        if info["life"] <= 2:
            died = True
            break

        ticked = True
        frames += 1
        unticked -= tick_interval

        # if ticked:
        #     now = time.time()
        #     if now - last_fps_time >= 1:
        #         fps = frames / (now - last_fps_time)
        #         last_fps_time = now
        #         frames = 0
        # else:
        #     time.sleep(0.001)

    env.close()
Example #14
    screen = np.ascontiguousarray(screen, dtype=np.float32) / 255
    screen = torch.from_numpy(screen)
    # Resize, and add a batch dimension (BCHW)
    return resize(screen).unsqueeze(0).to(device)


#reward needs to self design


num_episodes=5000
for i_episode in range(num_episodes):
    # Initialize the environment and state
    if i_episode == num_episodes - 1:
        print("last episode")
    env.reset()
    picture, _, _, last_info=env.step(0)
    last_screen = get_screen()
    current_screen = get_screen()
    state = current_screen
    for t in count():
        action=select_action(state,last_info['x_pos'])
        picture, reward, done, info = env.step(action.item())

        if info['flag_get']:
            reward = 99999
        elif reward < 0 and reward != -15:
            pass
        elif (reward == -15) or (info['time'] < 20):  # dead
            reward = -99999
        else:
            # reward=info['x_pos']+info['coins']*10+info['score']+info['time']*2
            pass
Example #15
    save_path = save_dir / AGENT_FILENAME

    if Path.is_file(save_path):
        print("Loading saved agent...")
        agent.load(save_path)

    done = False
    batch_size = 32

    for e in range(1, EPISODES + 1):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        time = 0
        while True:
            env.render()
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            reward = reward if not done else -10
            next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done or time >= 500:
                print("episode: {}/{}, score: {}, e: {:.2}".format(
                    e, EPISODES, time, agent.epsilon))
                break
            time += 1
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)
        if e % 10 == 0:
            agent.save(save_path)
Example #16
model.compile(loss='mse', optimizer=Adam(lr=learning_rate))

done = False
batch_size = 16
"""
state = env.reset()
_, _, _, info = env.step(0)
state = np.reshape(info["enemy"], [1, state_size])
"""
for i in range(8):
    for j in range(4):
        env1 = gym_super_mario_bros.make('SuperMarioBros-' + str(i + 1) + '-' +
                                         str(j + 1) + '-v0')
        env = BinarySpaceToDiscreteSpaceEnv(env1, COMPLEX_MOVEMENT)
        state = env.reset()
        _, _, _, info = env.step(0)
        total_reward = 0
        reward_checkpoint = 3000
        state = np.reshape(info["enemy"], [1, state_size])
        #state = np.append(state,info["time"])
        #state = np.reshape(state, [1, state_size+1])
        checkpoint = info["x_pos"] + 50
        done = False
        x0 = info["x_pos"]
        y0 = info["y_pos"]
        t0 = info["time"]
        t = 0
        for k in range(10):
            quieto = 0
            total_reward = 0
            while not done:
Example #17
class MarioEnvironment(Process):
    def __init__(
            self,
            env_id,
            is_render,
            env_idx,
            child_conn,
            history_size=4,
            life_done=False,
            h=84,
            w=84, movement=COMPLEX_MOVEMENT, sticky_action=True,
            p=0.25):
        super(MarioEnvironment, self).__init__()
        self.daemon = True
        self.env = BinarySpaceToDiscreteSpaceEnv(
            gym_super_mario_bros.make(env_id), movement)

        self.is_render = is_render
        self.env_idx = env_idx
        self.steps = 0
        self.episode = 0
        self.rall = 0
        self.recent_rlist = deque(maxlen=100)
        self.child_conn = child_conn

        self.life_done = life_done
        self.sticky_action = sticky_action
        self.last_action = 0
        self.p = p

        self.history_size = history_size
        self.history = np.zeros([history_size, h, w])
        self.h = h
        self.w = w

        self.reset()

    def run(self):
        super(MarioEnvironment, self).run()
        while True:
            action = self.child_conn.recv()
            if self.is_render:
                self.env.render()

            # sticky action
            if self.sticky_action:
                if np.random.rand() <= self.p:
                    action = self.last_action
                self.last_action = action

            # 4 frame skip
            reward = 0.0
            done = None
            for i in range(4):
                obs, r, done, info = self.env.step(action)
                if self.is_render:
                    self.env.render()
                reward += r
                if done:
                    break

            # when Mario loses life, changes the state to the terminal
            # state.
            if self.life_done:
                if self.lives > info['life'] and info['life'] > 0:
                    force_done = True
                    self.lives = info['life']
                else:
                    force_done = done
                    self.lives = info['life']
            else:
                force_done = done

            # reward range -15 ~ 15
            log_reward = reward / 15
            self.rall += log_reward

            r = int(info.get('flag_get', False))

            self.history[:3, :, :] = self.history[1:, :, :]
            self.history[3, :, :] = self.pre_proc(obs)

            self.steps += 1

            if done:
                self.recent_rlist.append(self.rall)
                print(
                    "[Episode {}({})] Step: {}  Reward: {}  Recent Reward: {}  Stage: {} current x:{}   max x:{}".format(
                        self.episode,
                        self.env_idx,
                        self.steps,
                        self.rall,
                        np.mean(
                            self.recent_rlist),
                        info['stage'],
                        info['x_pos'],
                        self.max_pos))

                self.history = self.reset()

            self.child_conn.send([self.history[:, :, :], r, force_done, done, log_reward])

    def reset(self):
        self.last_action = 0
        self.steps = 0
        self.episode += 1
        self.rall = 0
        self.lives = 3
        self.stage = 1
        self.max_pos = 0
        self.get_init_state(self.env.reset())
        return self.history[:, :, :]

    def pre_proc(self, X):
        # grayscaling
        x = cv2.cvtColor(X, cv2.COLOR_RGB2GRAY)
        # resize
        x = cv2.resize(x, (self.h, self.w))

        return x

    def get_init_state(self, s):
        for i in range(self.history_size):
            self.history[i, :, :] = self.pre_proc(s)
Example #18
    def run(self):
        global episode
        env = gym_super_mario_bros.make('SuperMarioBros-1-1-v3')
        env = BinarySpaceToDiscreteSpaceEnv(env, REALLY_COMPLEX_MOVEMENT)
        step = 0

        while episode < EPISODES:
            done = False

            max_x = 40
            no_progress = 0
            score = 0
            state = env.reset()

            # Making initial history with random actions
            for _ in range(5):
                next_state = state
                state, _, _, _ = env.step(0)

            state = preprocess(state)
            history = np.stack((state, state, state, state), axis=2)
            history = np.reshape([history], (1, 88, 128, 4))

            while not done:
                # Rendering code
                # Seems to be causing error in Mac OS
                #if self.thread_num==1:
                #    env.render()
                step += 1
                self.t += 1

                step_reward = 0

                action, policy = self.get_action(history)

                # Taking 3 steps with selected action
                # Mimicking frame skip
                for _ in range(6):
                    next_state, reward, done, info = env.step(action)
                    score += reward
                    step_reward += reward
                    if done:
                        break

                # Kill Mario if Mario is making no progress for 10 seconds
                x_now = info.get('x_pos')
                # Handling exception x_pos = 65535
                if x_now == 65535:
                    x_now = max_x
                if max_x < x_now:
                    max_x = x_now
                    no_progress = 0
                else:
                    no_progress += 1
                if no_progress == 150:
                    done = True
                    #reward -= 1
                    step_reward -= 1
                    score -= 1
                    print("#", self.thread_num, " STUCK")

                # Preprocessing each states
                next_state = preprocess(next_state)
                next_state = np.reshape([next_state], (1, 88, 128, 1))
                next_history = np.append(next_state,
                                         history[:, :, :, :3],
                                         axis=3)

                # Average policy max value
                self.avg_p_max += np.amax(
                    self.actor.predict(np.float32(history / 255.)))

                # Appending sample
                self.append_sample(history, action, step_reward)
                history = next_history
                if self.t >= self.t_max or done:
                    #if done:
                    self.train_model(done)
                    self.update_local_model()
                    self.t = 0

                if done:
                    # Recording training information

                    episode += 1
                    print("#", self.thread_num,
                          "  episode:", episode, "  score:",
                          format(score,
                                 '.2f'), "  step:", step, "max_x :", max_x)

                    stats = [score, self.avg_p_max / float(step), step]
                    for i in range(len(stats)):
                        self.sess.run(self.update_ops[i],
                                      feed_dict={
                                          self.summary_placeholders[i]:
                                          float(stats[i])
                                      })
                    summary_str = self.sess.run(self.summary_op)
                    self.summary_writer.add_summary(summary_str, episode + 1)
                    self.avg_p_max = 0
                    self.avg_loss = 0
                    step = 0
Example #19
        env.render()

        # Get out Q values for the current state
        Q = model.predict(state, batch_size=1)
        print(Q)

        # 10% of the time, we'll just take a random action
        if np.random.rand(1) < epsilon:
            #print("Taking random action")
            action = env.action_space.sample()
        else:
            #print("Taking non-random action")
            action = np.argmax(Q)

        # Take the action
        new_state, reward, game_over, _ = env.step(action)
        new_state = process_image(new_state)
        batch.append([state, action, reward, Q, new_state])
        batch_count += 1
        if batch_count == batch_size:
            process_batch(batch)
            batch = []
            batch_count = 0

        # Explore more as we get further closer and closer toward our
        # Current best score
        # This way we stick with what works, and only explore when we get closer to
        # "uncharted territory"
        #epsilon = 0.4 * ((reward + 300)/(max_episode_reward + 300))
        episode_reward += reward
        state = new_state
Example #20
def main():
    movement = SIMPLE_MOVEMENT
    movement.append(['left', 'A'])
    movement.append(['left', 'B'])
    movement.append(['left', 'A', 'B'])
    #movement.append(['B'])
    #movement.append(['down'])
    #movement.append(['up'])

    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, movement)

    #channels is acting as the number of frames in history
    #if resize_height and height are different, assert final_height < resize_height and image will be cropped
    channels = 4
    # width = 84
    # resize_height = 110
    # final_height = 84
    width=128
    resize_height = 168
    final_height = 128
    size = [channels, final_height, width]

    batch_size = 16
    replay_capacity = 100000
    replay_dir = '/home/hansencb/mario_replay/'

    gamma = 0.95

    start_epsilon = 0.3
    stop_epsilon = 0.01
    epsilon_decay = 0.00025

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(1)
    device = torch.device("cuda" if use_cuda else "cpu")

    model = simple_net(channels, len(movement), device).to(device)
    target_model = simple_net(channels, len(movement), device).to(device)

    data_file = 'data_loader'
    model_file = 'mario_agent'
    continue_train = True
    model.load_state_dict(torch.load(model_file))

    if continue_train:
        target_model.load_state_dict(torch.load(model_file))

    lr = 0.00005
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    total_reward_file ='total_reward.txt'


    if not continue_train:
        with open(total_reward_file, 'w') as f:
            f.write('Reward\tSteps\n')


    max_steps = 5000
    num_eps = 5000

    if continue_train:
        with open(data_file, 'rb') as f:
            data = pickle.load(f)
            data.batch_size = batch_size
    else:
        data = dataset(replay_capacity, batch_size, replay_dir, size)

        #initialize memory with 100 experiences
        done = True
        for i in range(100):
            if done:
                state = env.reset()
                state = preprocess(state, [resize_height, width], final_height)
                state = torch.cat((state, state, state, state))

            action = random.randint(0,len(movement)-1)
            next_state, reward, done, info = env.step(int(action))

            # if reward>0:
            #     reward = 1
            # else:
            #     reward = -1
            reward /= 15
            if reward == 0:
                reward = -0.1

            next_state = preprocess(next_state, [resize_height, width], final_height)
            next_state = torch.cat((state[1:, :, :], next_state))

            trans = transition(state, action, reward, next_state, done)
            data.add(trans)

            state = next_state


    tau = 0
    max_tau = 2000
    decay_step = 0
    farthest = 3000
    cur_x = 1

    #training loop
    for episode in range(num_eps):
        print('Episode {}'.format(episode+1))
        state = env.reset()
        state = preprocess(state, [resize_height, width], final_height)
        state = torch.cat((state, state, state, state))
        action = 0

        episode_reward = 0

        for step in range(max_steps):
            tau += 1


            #epsilon = stop_epsilon+(start_epsilon - stop_epsilon)*np.exp(-epsilon_decay*decay_step)
            epsilon = start_epsilon * np.exp(1-(1/(cur_x/farthest)))
            if epsilon < stop_epsilon:
                epsilon = stop_epsilon

            if random.random() < epsilon:
                action = random.randint(0,len(movement)-1)
            else:
                q_val, action, q_vals = maxQ(state, model, device)

            next_state, reward, done, info = env.step(int(action))

            cur_x = info['x_pos']

            if cur_x > farthest:
                farthest = cur_x

            # if reward > 0:
            #     reward = 1
            # else:
            #     reward = -1

            reward /= 15
            if reward == 0:
                reward = -0.1

            episode_reward += reward

            next_state = preprocess(next_state, [resize_height, width], final_height)
            next_state = torch.cat((state[1:,:,:], next_state))

            trans = transition(state, action, reward, next_state, done)
            data.add(trans)
            batch = data.get_batch(model, target_model, device, gamma)
            loss, abs_err = train(model, device, optimizer, batch)

            data.update_batch(batch['idx'], np.squeeze(torch.Tensor.numpy(abs_err)))

            state = next_state

            env.render()
            #time.sleep(0.03)

            if tau > max_tau:
                target_model.load_state_dict(model.state_dict())
                tau = 0

            if done:
                break

        decay_step += step
        with open(total_reward_file, 'a') as f:
            f.write('{}\t{}\n'.format(episode_reward, step))

        if episode % 5 == 0:
            with open(model_file, 'wb') as f:
                torch.save(model.state_dict(), f)
            with open(data_file, 'wb') as f:
                pickle.dump(data, f)


    env.close()
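Example #20 relies on several project helpers (preprocess, maxQ, transition, dataset, train) defined elsewhere; a minimal sketch of maxQ, assuming simple_net maps a stacked frame tensor to one Q-value per action:

import torch

# Hypothetical helper, not the project's actual maxQ: evaluate the network on a
# single stacked state and return (max Q value, greedy action index, all Q values).
def maxQ(state, model, device):
    with torch.no_grad():
        q_vals = model(state.unsqueeze(0).float().to(device)).squeeze(0)
    q_val, action = torch.max(q_vals, dim=0)
    return q_val.item(), int(action.item()), q_vals.cpu()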
Example #21
    oldi = {
        'coins': 0,
        'flag_get': False,
        'life': 2,
        'score': 0,
        'stage': 1,
        'status': 'small',
        'time': 400,
        'world': 1,
        'x_pos': 40
    }
    while oldi['life'] == 2:
        action_val, gradients_val = sess.run(
            [action, gradients],
            feed_dict={X: obs.reshape(-1, 240, 256, 3)})
        obs, rwd, done, info = env.step(action_val[0][0])
        creward = reward(info, oldi)
        oldi = info
        current_rewards.append(creward)
        current_gradients.append(gradients_val)
        #env.render()
    all_rewards.append(current_rewards)
    all_gradients.append(current_gradients)
all_rewards = discnormrewards(all_rewards)
feed_dict = {}
for var_index, gradient_placeholder in enumerate(
        gradient_placeholders):
    mean_gradients = np.mean([
        reward * all_gradients[game_index][step][var_index]
        for game_index, rewards in enumerate(all_rewards)
        for step, reward in enumerate(rewards)
Example #22

if __name__ == "__main__":

    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, REALLY_COMPLEX_MOVEMENT)
    agent = DQNAgent(action_size)

    total_rewards, episodes = [], []

    for e in range(EPISODES):
        state = env.reset()
        step, total_reward = 0, 0
        done = False
        for _ in range(8):
            start, _, _, _ = env.step(0)
        start = preprocess2(start)
        start = np.reshape(start, (1, 88, 128, 1))
        history = np.stack((start, start, start, start), axis=3)
        history = np.reshape([history], (1, 88, 128, 4))
        while not done:
            if agent.render:
                env.render()
            global_step += 1
            step += 1
            step_reward = 0
            epsilon = agent.epsilon_now(e)
            action = agent.act(history)
            for _ in range(8):
                next_state, reward, done, _ = env.step(action)
                step_reward += reward
Example #23
    ['A'],
    ['B'],
    ['right'],
    ['right', 'A'],
    ['right', 'B'],
    ['right', 'A', 'B'],
    ['left'],
    ['left', 'A'],
    ['left', 'B'],
    ['left', 'A', 'B'],
    #    ['down'],
    #    ['up']
]

_env = gym_super_mario_bros.make('SuperMarioBros-v0')
#_env = gym_super_mario_bros.SuperMarioBrosEnv(frames_per_step=1, rom_mode='rectangle')
env = BinarySpaceToDiscreteSpaceEnv(_env, movements)
env = DummyVecEnv([lambda: env])
model = PPO2(policy=CnnPolicy, env=env, verbose=1)
model.learn(total_timesteps=10000)

obs = env.reset()

while True:
    action, _info = model.predict(obs)

    obs, rewards, dones, info = env.step(action)
    print("학습끝")
    print(rewards)
    env.render()
Example #24
from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
import torch
import cv2

env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)

done = True
for step in range(5000):
    if done:
        state = env.reset()
    state, reward, done, info = env.step(env.action_space.sample())
    env.render()

env.close()
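In newer nes_py releases the BinarySpaceToDiscreteSpaceEnv wrapper used above was renamed to JoypadSpace; the same random-agent setup with the newer import looks like this:

from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT

# Equivalent environment setup with the renamed wrapper.
env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = JoypadSpace(env, SIMPLE_MOVEMENT)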
Example #25
def main():
    movement = SIMPLE_MOVEMENT
    movement.append(['left', 'A'])
    movement.append(['left', 'B'])
    movement.append(['left', 'A', 'B'])
    #movement.append(['B'])
    #movement.append(['down'])
    #movement.append(['up'])

    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, movement)

    #channels is acting as the number of frames in history
    #if resize_height and height are different, assert final_height < resize_height and image will be cropped
    channels = 3
    frames = 4
    width = 128
    resize_height = 180
    final_height = 128
    bottom_chop = 15

    epsilon = 0.0

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(1)
    device = torch.device("cuda" if use_cuda else "cpu")

    model = simple_net(channels, len(movement), device).to(device)

    model_file = 'mario_agent'
    model.load_state_dict(torch.load(model_file))

    max_steps = 5000
    num_eps = 1

    for episode in range(num_eps):
        print('Episode {}'.format(episode + 1))
        state = env.reset()
        state = preprocess(state, [resize_height, width, 3], final_height,
                           bottom_chop)
        state = torch.cat((state, state, state, state))
        action = 0

        episode_reward = 0

        for step in range(max_steps):
            if step % 3 == 0:
                if random.random() < epsilon:
                    action = random.randint(0, len(movement) - 1)
                else:
                    q_val, action, q_vals = maxQ(state, model, device)

            next_state, reward, done, info = env.step(int(action))

            if reward > 0:
                reward = 1
            else:
                reward = -1

            episode_reward += reward

            next_state = preprocess(next_state, [resize_height, width, 3],
                                    final_height, bottom_chop)
            next_state = torch.cat((state[3:, :, :], next_state))

            state = next_state

            env.render()
            time.sleep(0.03)

            if done:
                break

    env.close()
Example #26

# Instantiate memory
memory = Memory(max_size=memory_size)
for i in range(pretrain_length):
    # If it's the first step
    if i == 0:
        state = env.reset()

        state, stacked_frames = stack_frames(stacked_frames, state, True)

    # Get the next_state, the rewards, done by taking a random action
    choice = random.randint(1, env.action_space.n) - 1
    action = possible_actions[choice]

    next_state, reward, done, _ = env.step(choice)

    # env.render()

    # Stack the frames
    next_state, stacked_frames = stack_frames(stacked_frames, next_state,
                                              False)

    # If the episode is finished (we're dead 3x)
    if done:
        # We finished the episode
        next_state = np.zeros(state.shape)

        # Add experience to memory
        memory.add((state, action, reward, next_state, done))  # TODO
Example #27
## Base model to run the game, using random movements
from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv
from aux import *
import gym_super_mario_bros
from gym_super_mario_bros.actions import COMPLEX_MOVEMENT
env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = BinarySpaceToDiscreteSpaceEnv(env, COMPLEX_MOVEMENT)

done = True
oldi = {
    'coins': 0,
    'flag_get': False,
    'life': 2,
    'score': 0,
    'stage': 1,
    'status': 'small',
    'time': 400,
    'world': 1,
    'x_pos': 40
}
for step in range(100):
    if done:
        state = env.reset()
    state, rwd, done, info = env.step(1)  #env.action_space.sample())
    print(reward(info, oldi), "vs", rwd)
    print(env.observation_space.shape)
    oldi = info
    env.render()

env.close()
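Examples #21 and #27 both compare a custom reward(info, oldi) against the environment's built-in reward, but that function (imported from aux) is not shown; a minimal sketch, assuming it scores the change between two consecutive info dicts:

# Hypothetical reward shaping, not the actual aux.reward: compare consecutive
# info dicts returned by env.step().
def reward(info, oldi):
    r = info['x_pos'] - oldi['x_pos']          # forward progress
    r += (info['coins'] - oldi['coins']) * 10  # coin pickups
    r += info['score'] - oldi['score']         # score delta
    if info['life'] < oldi['life']:
        r -= 50                                # losing a life
    if info['flag_get']:
        r += 500                               # reaching the flag
    return r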
Example #28
class Game:
    def __init__(self, game_id, obs_size, skip_frame=4, mode='train'):
        self.game_id = game_id
        env = gym_super_mario_bros.make(game_id)
        temp_obs = env.reset()
        height, width, _ = temp_obs.shape
        self.env = BinarySpaceToDiscreteSpaceEnv(env, COMPLEX_MOVEMENT)

        self.obs_last2max = np.zeros((2, obs_size, obs_size, 1), np.uint8)

        self.obstack = np.zeros((obs_size, obs_size, 4))
        self.rewards = []
        self.lives = 3
        self.skip = skip_frame
        self.mode = mode
        if self.mode == 'play':
            self.monitor = Monitor(width=width, height=height)

    def step(self, action, monitor=False):
        reward = 0.0
        done = False

        for i in range(self.skip):
            obs, r, done, info = self.env.step(action)

            if self.mode == 'play':
                self.monitor.record(obs)

            if i >= 2:
                self.obs_last2max[i % 2] = self._process_obs(obs)

            # Super Mario's reward is clipped to [-15.0, 15.0]
            reward += r / 15.0
            lives = info['life']

            if lives < self.lives:
                done = True

            self.lives = lives

            if done:
                break

        self.rewards.append(reward)

        if done:
            episode_info = {
                "reward": sum(self.rewards),
                "length": len(self.rewards)
            }
            self.reset()
        else:
            episode_info = None

            obs = self.obs_last2max.max(axis=0)

            self.obstack = np.roll(self.obstack, shift=-1, axis=-1)
            self.obstack[..., -1:] = obs

        return self.obstack, reward, done, episode_info

    def reset(self):
        obs = self.env.reset()

        obs = self._process_obs(obs)
        self.obstack[..., 0:] = obs
        self.obstack[..., 1:] = obs
        self.obstack[..., 2:] = obs
        self.obstack[..., 3:] = obs
        self.rewards = []

        self.lives = 3

        return self.obstack

    @staticmethod
    def _process_obs(obs):

        obs = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY)
        obs = cv2.resize(obs, (84, 84), interpolation=cv2.INTER_AREA)
        return obs[:, :, None]
Example #29
    ['right'],
    ['right', 'A'],
    ['right', 'B'],
    ['right', 'A', 'B'],
    ['A'],
    ['left'],
    ['left', 'A'],
    ['left', 'B'],
    ['left', 'A', 'B'],
    ['down'],
]
env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = BinarySpaceToDiscreteSpaceEnv(env, COMPLEX_MOVEMENT)
done = True

for i in range(1000):
    if done:
        state = env.reset()

    action = env.action_space.sample()

    observation, reward, done, info = env.step(action)

    print(observation)

    print(reward)

    #env.render()

env.close()
Example #30
def main():
    movement = SIMPLE_MOVEMENT
    movement.append(['left', 'A'])
    movement.append(['left', 'B'])
    movement.append(['left', 'A', 'B'])
    movement.append(['B'])
    movement.append(['down'])
    movement.append(['up'])

    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, movement)

    #channels is acting as the number of frames in history
    #if resize_height and height are different, assert final_height < resize_height and image will be cropped
    channels = 4
    width = 84
    resize_height = 110
    final_height = 84
    size = [channels, final_height, width]

    batch_size = 32
    replay_capacity = 100000
    replay_dir = '/home/hansencb/mario_replay/'
    epsilon = 1
    gamma = 0.9

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(1)
    device = torch.device("cuda" if use_cuda else "cpu")

    model = simple_net(channels, len(movement), device).to(device)
    target_model = simple_net(channels, len(movement), device).to(device)

    model_file = 'mario_agent'
    model.load_state_dict(torch.load(model_file))
    target_model.load_state_dict(torch.load(model_file))

    lr = 0.001
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    total_reward_file = 'total_reward.txt'
    with open(total_reward_file, 'w') as f:
        f.write('Reward\tSteps\n')

    max_steps = 5000
    num_eps = 1000

    data = dataset(replay_capacity, batch_size, replay_dir, 1, size)

    for episode in range(num_eps):
        print('Episode {}'.format(episode + 1))
        state = env.reset()
        state = preprocess(state, [resize_height, width], final_height)
        state = torch.cat((state, state, state, state))
        action = 0

        episode_reward = 0

        for step in range(max_steps):
            if step % 3 == 0:
                if random.random() < epsilon:
                    action = random.randint(0, len(movement) - 1)
                else:
                    q_val, action = maxQ(state, model, device)

            next_state, reward, done, info = env.step(int(action))

            if reward > 0:
                reward = 1
            else:
                reward = -1

            episode_reward += reward

            next_state = preprocess(next_state, [resize_height, width],
                                    final_height)
            next_state = torch.cat((state[1:, :, :], next_state))

            trans = transition(state, action, reward, next_state, done)
            data.add(trans)
            train(model, device, optimizer,
                  data.get_batch(model, device, gamma))

            state = next_state

            env.render()
            #time.sleep(0.03)

            if done:
                with open(total_reward_file, 'a') as f:
                    f.write('{}\t{}\n'.format(episode_reward, step))

                break

        epsilon -= (1 / num_eps)
        if episode % 10 == 0:
            target_model.load_state_dict(model.state_dict())

            with open(model_file, 'wb') as f:
                torch.save(model.state_dict(), f)

    env.close()