Example #1
def train_agent(args):
    # if gpu is to be used
    device = torch.device(
        "cuda" if torch.cuda.is_available() and args.ngpu > 0 else "cpu")

    # Build env (first level, right only)
    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)

    # setup networks
    init_screen = get_screen(env, device)
    _, _, screen_height, screen_width = init_screen.shape

    # Get number of actions from gym action space
    args.n_actions = env.action_space.n

    policy_net = DQN(screen_height, screen_width, args.n_actions).to(device)
    target_net = DQN(screen_height, screen_width, args.n_actions).to(device)

    if args.targetNet:
        target_net.load_state_dict(
            torch.load(args.targetNet, map_location=device))

    if args.policyNet:
        policy_net.load_state_dict(
            torch.load(args.policyNet, map_location=device))

    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = optim.RMSprop(policy_net.parameters())
    memory = ReplayMemory(10000)

    args.steps_done = 0

    num_episodes = 1

    for i_episode in range(num_episodes):
        # Initialize the environment and state
        env.reset()
        last_screen = get_screen(env, device)
        current_screen = get_screen(env, device)
        state = current_screen - last_screen
        for t in count():
            # Select and perform an action
            action = select_action(state, policy_net, args, device)
            _, reward, done, _ = env.step(action.item())
            reward = torch.tensor([reward], device=device)

            # Observe new state
            last_screen = current_screen
            current_screen = get_screen(env, device)
            if not done:
                next_state = current_screen - last_screen
            else:
                next_state = None

            # Store the transition in memory
            memory.push(state, action, next_state, reward)

            # Move to the next state
            state = next_state

            # Perform one step of the optimization (on the policy network)
            optimize_model(optimizer, memory, policy_net, target_net, args,
                           device)
            if done:
                episode_durations.append(t + 1)
                break
        # Update the target network, copying all weights and biases in DQN
        if i_episode % args.target_update == 0:
            target_net.load_state_dict(policy_net.state_dict())
            torch.save(policy_net.state_dict(), args.output_policyNet)
            torch.save(target_net.state_dict(), args.output_targetNet)

        if i_episode % 10 == 0:
            print(f'{i_episode+1}/{num_episodes}: Completed Episode.')

    print('Complete')
    env.close()

    torch.save(policy_net.state_dict(), args.output_policyNet)
    torch.save(target_net.state_dict(), args.output_targetNet)
Example #2
def process_single_session(session_path,
                           output_path=None,
                           render=False,
                           length=None):

    with open(session_path) as json_file:
        data = json.load(json_file)

    if output_path is not None:
        output_path = Path(output_path)
        output_path.mkdir(exist_ok=True, parents=True)
        shutil.copyfile(session_path, output_path.joinpath("data.json"))
        output_path.joinpath("frames").mkdir(exist_ok=True)

    first_world = "SuperMarioBros-1-1-v0"
    env = gym_super_mario_bros.make(first_world)

    next_state = env.reset()

    world = 1
    stage = 1
    stage_num = 0
    frame_number = 0
    steps = 0

    for i, action in enumerate(data["obs"]):

        if length is not None:
            if i >= length:
                break

        if render:
            env.render()

        next_state, _, done, info = env.step(action)
        steps += 1

        if output_path is not None:
            cvt_state = cv2.cvtColor(next_state, cv2.COLOR_BGR2RGB)
            impath = str(
                output_path.joinpath(f"frames/frame_{frame_number}.png"))
            cv2.imwrite(impath, cvt_state)

        finish = False
        frame_number += 1

        if info["flag_get"]:
            finish = True

        if done:
            done = False

            if finish or steps >= 16000:
                stage_num += 1
                world, stage, new_world = make_next_stage(
                    world, stage, stage_num)
                env.close()
                env = gym_super_mario_bros.make(new_world)
                finish = False
                steps = 0

            next_state = env.reset()
Example #3
def run(training_mode, pretrained):

    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = make_env(env)  # Wraps the environment so that frames are grayscale
    observation_space = env.observation_space.shape
    action_space = env.action_space.n

    agent = DQNAgent(state_space=observation_space,
                     action_space=action_space,
                     max_memory_size=30000,
                     batch_size=64,
                     gamma=0.90,
                     lr=0.00025,
                     dropout=0.,
                     exploration_max=1.0,
                     exploration_min=0.02,
                     exploration_decay=0.99,
                     double_dq=True,
                     pretrained=pretrained)

    num_episodes = 10001
    env.reset()
    total_rewards = []
    with open('training_log.txt', 'w') as log_file:
        log_file.write('ep_num\tsteps\taction\treward\tterminal\ttotal_reward\n')

    for ep_num in tqdm(range(num_episodes)):
        state = env.reset()
        state = torch.Tensor([state])
        total_reward = 0
        steps = 0
        while True:
            if not training_mode:
                show_state(env, ep_num,
                           f"step: {steps} reward: {int(total_reward)}")
            action = agent.act(state)
            steps += 1

            state_next, reward, terminal, info = env.step(int(action[0]))
            total_reward += reward
            state_next = torch.Tensor([state_next])
            reward = torch.tensor([reward]).unsqueeze(0)

            terminal = torch.tensor([int(terminal)]).unsqueeze(0)

            if training_mode:
                agent.remember(state, action, reward, state_next, terminal)
                agent.experience_replay()

            state = state_next
            with open('training_log.txt', 'a') as log_file:
                log_file.write(
                    f'{ep_num}\t{steps}\t{action.item()}\t{reward.item()}\t{terminal}\t{total_reward}\n'
                )
            if terminal:
                break

        total_rewards.append(total_reward)

        print("Total reward after episode {} is {}".format(
            ep_num + 1, total_rewards[-1]))
        num_episodes += 1

    if training_mode:
        with open("ending_position.pkl", "wb") as f:
            pickle.dump(agent.ending_position, f)
        with open("num_in_queue.pkl", "wb") as f:
            pickle.dump(agent.num_in_queue, f)
        with open("total_rewards.pkl", "wb") as f:
            pickle.dump(total_rewards, f)
        if agent.double_dq:
            torch.save(agent.local_net.state_dict(), "dq1.pt")
            torch.save(agent.target_net.state_dict(), "dq2.pt")
        else:
            torch.save(agent.dqn.state_dict(), "dq.pt")
        torch.save(agent.STATE_MEM, "STATE_MEM.pt")
        torch.save(agent.ACTION_MEM, "ACTION_MEM.pt")
        torch.save(agent.REWARD_MEM, "REWARD_MEM.pt")
        torch.save(agent.STATE2_MEM, "STATE2_MEM.pt")
        torch.save(agent.DONE_MEM, "DONE_MEM.pt")

    env.close()

    if num_episodes > 500:
        plt.title("Episodes trained vs. Average Rewards (per 500 eps)")
        plt.plot([0 for _ in range(500)] + np.convolve(
            total_rewards, np.ones((500, )) / 500, mode="valid").tolist())
        plt.show()
Example #4
def run(run_name, existing_model):

    # Create log dir
    log_dir = "./monitor_logs/"
    os.makedirs(log_dir, exist_ok=True)

    print("Setting up environment...")
    env = gym_super_mario_bros.make('SuperMarioBrosRandomStages-v0')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)
    env = EpisodicLifeEnv(env)

    # Preprocessing
    env = WarpFrame(env)
    env = FrameStack(env, n_frames=hp.FRAME_STACK)

    # Evaluate every kth frame and repeat action
    env = MaxAndSkipEnv(env, skip=hp.FRAME_SKIP)

    # Logs will be saved in log_dir/monitor.csv
    env = Monitor(env, log_dir)

    # Save a checkpoint every 1000 steps
    checkpoint_callback = CheckpointCallback(save_freq=25000,
                                             save_path='./models/',
                                             name_prefix=run_name)

    eval_callback = EvalCallback(env,
                                 best_model_save_path='./models/',
                                 log_path='./models/',
                                 eval_freq=250000,
                                 deterministic=True,
                                 render=False)

    print("Compiling model...")

    if existing_model:
        try:
            model = DQN.load(existing_model,
                             env,
                             tensorboard_log="./mario_tensorboard/")
        except Exception:
            print(f"{existing_model} does not exist!")
            exit(0)
    else:
        model = DQN(
            LnCnnPolicy,
            env,
            batch_size=hp.BATCH_SIZE,  # Optimizable (higher batch sizes are OK according to https://arxiv.org/pdf/1803.02811.pdf)
            verbose=1,
            learning_starts=10000,
            learning_rate=hp.LEARNING_RATE,
            exploration_fraction=hp.EXPLORATION_FRACT,
            exploration_initial_eps=1.0,
            exploration_final_eps=0.1,
            prioritized_replay=True,
            prioritized_replay_alpha=hp.P_REPLAY_ALPHA,
            train_freq=hp.TRAINING_FREQ,
            target_network_update_freq=hp.TARGET_UPDATE_FREQ,
            tensorboard_log="./mario_tensorboard/")

    print("Training starting...")
    with ProgressBarManager(hp.TIME_STEPS) as progress_callback:
        model.learn(
            total_timesteps=hp.TIME_STEPS,
            log_interval=1,
            callback=[progress_callback, checkpoint_callback, eval_callback],
            tb_log_name=run_name)

    print("Done! Saving model...")
    model.save("models/{}_final".format(run_name))
Example #5
        self.g_opt.zero_grad()
        loss.backward()
        for lp, gp in zip(self.model.l_net.parameters(),
                          self.g_net.parameters()):
            gp._grad = lp.grad.clone().cpu()
        self.g_opt.step()

        self.model.l_net.load_state_dict(self.g_net.state_dict())


if __name__ == '__main__':
    writer = SummaryWriter('runs/Vanilla')

    ####### Env Settings ##########
    env_id = 'SuperMarioBros-v2'
    env = gym_super_mario_bros.make(env_id)
    env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
    s_dim = 4  # transition
    a_dim = env.action_space.n
    env.close()
    ###############################

    ####### MultiProcessing Settings ##########
    num_worker = 1
    workers = []
    parent_conns = []
    queue = Queue()
    ###########################################

    ##### Etc Settings ########################
    max_episode = 1000000
Example #6
            running_add = reward[t] + args.gamma * running_add * (1 - done[t])
            discounted_return[t] = running_add

        # For Actor
        adv = discounted_return - value

    return discounted_return, adv


if __name__ == '__main__':

    args = parser.parse_args()

    # get environment information
    env = BinarySpaceToDiscreteSpaceEnv(
        gym_super_mario_bros.make(args.env_id), SIMPLE_MOVEMENT)
    input_size = env.observation_space.shape
    output_size = env.action_space.n

    env.close()

    # setup 
    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    tag = ["test", "train"][int(args.training)]
    log_dir = os.path.join(args.logdir, '{}_{}_{}_{}'.format(
        args.env_id, args.name, current_time, tag))
    writer = SummaryWriter(log_dir)

    model_path = 'saved/{}_{}_{}.model'.format(args.env_id, 
                    args.name, current_time)
    load_model_path = 'saved/{}'.format(args.prev_model)
Example #7
def create_mario_env(env_id):
    env = gym_super_mario_bros.make(env_id)
    env = BinarySpaceToDiscreteSpaceEnv(env, COMPLEX_MOVEMENT)
    env = wrap_mario(env)
    return env
Example #8
def main():
    movement = list(SIMPLE_MOVEMENT)  # copy so the imported SIMPLE_MOVEMENT list is not mutated
    movement.append(['left', 'A'])
    movement.append(['left', 'B'])
    movement.append(['left', 'A', 'B'])
    movement.append(['B'])
    movement.append(['down'])
    movement.append(['up'])

    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, movement)

    # channels acts as the number of frames of history
    # if resize_height and final_height differ, final_height must be < resize_height and the image will be cropped
    channels = 4
    width = 84
    resize_height = 110
    final_height = 84

    epsilon = 0.0

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(1)
    device = torch.device("cuda" if use_cuda else "cpu")

    model = simple_net(channels, len(movement), device).to(device)

    model_file = 'mario_agent'
    model.load_state_dict(torch.load(model_file))

    max_steps = 5000
    num_eps = 1

    for episode in range(num_eps):
        print('Episode {}'.format(episode + 1))
        state = env.reset()
        state = preprocess(state, [resize_height, width], final_height)
        state = torch.cat((state, state, state, state))
        action = 0

        episode_reward = 0

        for step in range(max_steps):
            if step % 3 == 0:
                if random.random() < epsilon:
                    action = random.randint(0, len(movement) - 1)
                else:
                    q_val, action, q_vals = maxQ(state, model, device)

            next_state, reward, done, info = env.step(int(action))

            if reward > 0:
                reward = 1
            else:
                reward = -1

            episode_reward += reward

            print(reward)

            next_state = preprocess(next_state, [resize_height, width],
                                    final_height)
            next_state = torch.cat((state[1:, :, :], next_state))

            state = next_state

            env.render()
            time.sleep(0.03)

            if done:
                break

    env.close()
Example #9
def main():
    movement = list(SIMPLE_MOVEMENT)  # copy so the imported SIMPLE_MOVEMENT list is not mutated
    movement.append(['left', 'A'])
    movement.append(['left', 'B'])
    movement.append(['left', 'A', 'B'])
    #movement.append(['B'])
    #movement.append(['down'])
    #movement.append(['up'])

    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, movement)

    # channels acts as the number of frames of history
    # if resize_height and final_height differ, final_height must be < resize_height and the image will be cropped
    channels = 3
    frames = 4
    width = 128
    resize_height = 180
    final_height = 128
    bottom_chop = 15
    size = [channels * frames, final_height, width]

    batch_size = 16
    replay_capacity = 100000
    replay_dir = '/home-local/bayrakrg/mario_replay/'
    start_epsilon = 1.0
    stop_epsilon = 0.01
    epsilon_decay = 0.00005
    gamma = 0.75

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(1)
    device = torch.device("cuda" if use_cuda else "cpu")

    model = simple_net(channels, len(movement), device).to(device)
    target_model = simple_net(channels, len(movement), device).to(device)

    model_file = 'mario_agent'
    model.load_state_dict(torch.load(model_file))
    target_model.load_state_dict(torch.load(model_file))

    lr = 0.0001
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    total_reward_file = 'total_reward.txt'
    with open(total_reward_file, 'w') as f:
        f.write('Reward\tSteps\n')

    max_steps = 500
    num_eps = 10000

    data = dataset(replay_capacity, batch_size, replay_dir, 1, size)

    tau = 0
    max_tau = 10000
    decay_step = 0

    for episode in range(num_eps):
        print('Episode {}'.format(episode + 1))
        state = env.reset()
        state = preprocess(state, [resize_height, width, 3], final_height,
                           bottom_chop)
        state = torch.cat((state, state, state, state))
        action = 0
        episode_reward = 0

        for step in range(max_steps):
            tau += 1
            decay_step += 1

            epsilon = stop_epsilon + (start_epsilon - stop_epsilon) * np.exp(
                -epsilon_decay * decay_step)

            if random.random() < epsilon:
                action = random.randint(0, len(movement) - 1)
            else:
                q_val, action, q_vals = maxQ(state, model, device)

            next_state, reward, done, info = env.step(int(action))

            if step == max_steps - 1:
                reward -= 10

            if reward > 0:
                reward = 1
            else:
                reward = -1

            episode_reward += reward

            next_state = preprocess(next_state, [resize_height, width, 3],
                                    final_height, bottom_chop)
            next_state = torch.cat((state[3:, :, :], next_state))

            trans = transition(state, action, reward, next_state, done)
            data.add(trans)
            train(model, device, optimizer,
                  data.get_batch(model, target_model, device, gamma))

            state = next_state

            env.render()

            if tau > max_tau:
                target_model.load_state_dict(model.state_dict())
                tau = 0

            if done:
                break

        with open(total_reward_file, 'a') as f:
            f.write('{}\t{}\n'.format(episode_reward, step))

        if episode % 5 == 0:
            with open(model_file, 'wb') as f:
                torch.save(model.state_dict(), f)

    env.close()
Example #10
Environment
"""""""""""""""

Initialize Environment
------------------------

In Mario, the environment consists of tubes, mushrooms and other
components.

When Mario makes an action, the environment responds with the changed
(next) state, reward and other info.
"""

# Initialize Super Mario environment
env = gym_super_mario_bros.make("SuperMarioBros-1-1-v0")

# Limit the action-space to
#   0. walk right
#   1. jump right
env = JoypadSpace(env, [["right"], ["right", "A"]])

env.reset()
next_state, reward, done, info = env.step(action=0)
print(f"{next_state.shape},\n {reward},\n {done},\n {info}")
"""Preprocess Environment
------------------------

Environment data is returned to the agent in ``next_state``. As you saw
above, each state is represented by a ``[3, 240, 256]`` size array.
Often that is more information than our agent needs; for instance,
Mario's actions do not depend on the color of the pipes or the sky!
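A common remedy, before handing frames to an agent, is to grayscale, downsize,
and stack consecutive observations. The snippet below is only an illustrative
sketch, not part of the tutorial excerpt above: it assumes a gym version that
ships the ``GrayScaleObservation``, ``ResizeObservation``, and ``FrameStack``
wrappers, and the 84x84 target size is an arbitrary choice.

import gym_super_mario_bros
from nes_py.wrappers import JoypadSpace
from gym.wrappers import GrayScaleObservation, ResizeObservation, FrameStack

env = gym_super_mario_bros.make("SuperMarioBros-1-1-v0")
env = JoypadSpace(env, [["right"], ["right", "A"]])

# Drop the color channel, shrink each frame to 84x84, and keep the last
# four frames so the agent can infer motion from a single observation.
env = GrayScaleObservation(env, keep_dim=False)
env = ResizeObservation(env, shape=84)
env = FrameStack(env, num_stack=4)

state = env.reset()
# state now holds four stacked 84x84 grayscale frames instead of one
# raw [3, 240, 256] RGB image.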
Example #11
def main():
    env = gym_super_mario_bros.make('SuperMarioBros-v0')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)

    agent = DQNAgent(action_size=7)

    scores, episodes, global_step = [], [], 0

    global_start = datetime.now()
    local_start = datetime.now()

    print()
    print("=" * 100)
    print("RL environment initialized")
    print("=" * 100)
    print()
    gc.collect()

    for e in range(1000):
        e = e + 1
        done = False
        dead = False

        step, score, start_life = 0, 0, 5
        observe = env.reset()

        for _ in range(random.randint(1, agent.no_op_steps)):
            observe, _, _, _ = env.step(1)

        state = agent.pre_processing(observe)
        history = np.stack((state, state, state, state), axis=2)
        history = np.reshape([history], (1, 240, 256, 4))

        count_epsilon = 0
        count_greedy = 0

        coinStatus = 0
        marioStatus = "small"
        flagStatus = False
        softReward = 0
        lifeStatus = 2

        while not done:
            # if agent.render:
            #     env.render()
            global_step += 1
            step += 1
            # Select an action based on the previous four states
            action, res = agent.get_action(history)
            if res:
                count_epsilon += 1
            else:
                count_greedy += 1

            # Advance the environment one timestep with the selected action
            observe, reward, done, info = env.step(action)
            # Preprocess the state at every timestep
            next_state = agent.pre_processing(observe)
            next_state = np.reshape([next_state], (1, 240, 256, 1))
            next_history = np.append(next_state, history[:, :, :, :3], axis=3)
            agent.avg_q_max += np.amax(agent.model.predict(np.float32(history / 255.))[0])
            if start_life > info['life']:
                dead = True
                start_life = info['life']
            # reward = np.clip(reward, -1., 1.)
            real_reward = reward

            ###
            ###
            ###
            # reward = reward
            # if coinStatus != info["coins"]:
            #     coinStatus = info["coins"]
            #     reward = reward + 10
            # if marioStatus != info["status"]:
            #     marioStatus = info["status"]
            #     reward = reward + 200
            # if flagStatus != info["flag_get"]:
            #     flagStatus = info["flag_get"]
            #     reward = reward + 200
            # if lifeStatus != info["life"]:
            #     lifeStatus = info["life"]
            #     reward = reward - 20
            #
            # if info["x_pos"] < 10:
            #     info["x_pos"] = 10
            # if info["time"] < 10:
            #     info["time"] = 10
            #
            # reward = reward + math.log((info["x_pos"] / info["time"]) + info["x_pos"])

            # Store the sample <s, a, r, s'> in replay memory, then train
            agent.append_sample(history, action, reward, next_history, dead)
            if len(agent.memory) >= agent.train_start:
                agent.train_model()
            # Periodically update the target model with the model's weights
            if global_step % agent.update_target_rate == 0:
                agent.update_target_model()

            # score += reward
            score += real_reward

            if dead:
                dead = False
            else:
                history = next_history

            if global_step == 0:
                pass
            elif global_step % 1000 == 0:
                print("local step : {}, time : {} sec, epsilon : {}".format(global_step, (datetime.now() - local_start).seconds, agent.epsilon))
                local_start = datetime.now()

            if done:
                ep_result = "episode : {}, score : {}, memory : {}, step : {}".format(e, score, len(agent.memory), global_step)
                print(ep_result)
                print("epsilon : {}, greedy : {}".format(count_epsilon, count_greedy))
                print()
                print("time elapsed : {} sec".format((datetime.now() - global_start).seconds))
                global_start = datetime.now()
                agent.epsilon = agent.epsilon - agent.epsilon_decay_step
                print("epsilon decay to {}!".format(agent.epsilon))
                print()

                slack_msg(ep_result)

                # if score > 2000 and score <= 3000:
                #     agent.epsilon = 0.075
                # elif score > 3000 and score <= 5000:
                #     agent.epsilon = 0.05
                # elif score > 5000 and score <= 10000:
                #     agent.epsilon = 0.005

                agent.avg_q_max, agent.avg_loss, global_step = 0, 0, 0

        # Periodically save the model
        if e == 0:
            pass
        elif e % 2 == 0:
            agent.model.save_weights("./dqn.h5")
            # dump(agent.memory, "memory.joblib")
            print("model saved!")
            print()

        gc.collect()
Example #12
    def run(self):
        global episode
        env = gym_super_mario_bros.make('SuperMarioBros-v0')
        env = JoypadSpace(env, SIMPLE_MOVEMENT)
        # env = gym.make(env_name)
        # env.render()

        step = 0

        gc.collect()

        while episode < EPISODES:
            done = False
            dead = False

            score, start_life = 0, 5
            observe = env.reset()
            next_observe = observe

            # Stand still for a random number (1-30) of initial steps
            for _ in range(random.randint(1, 30)):
                observe = next_observe
                next_observe, _, _, _ = env.step(1)

            state = pre_processing(next_observe, observe)
            history = np.stack((state, state, state, state), axis=2)
            history = np.reshape([history], (1, 240, 256, 4))

            coinStatus = 0
            marioStatus = "small"
            flagStatus = False
            softReward = 0
            lifeStatus = 2

            while not done:
                step += 1
                self.t += 1
                observe = next_observe
                action, policy = self.get_action(history)

                # # 1: stop, 2: left, 3: right
                # if action == 0:
                #     real_action = 1
                # elif action == 1:
                #     real_action = 2
                # else:
                #     real_action = 3
                #
                # # Take the fire action to restart after dying
                # if dead:
                #     action = 0
                #     real_action = 1
                #     dead = False

                # Execute one step with the selected action
                next_observe, reward, done, info = env.step(action)

                # Preprocess the state at every timestep
                next_state = pre_processing(next_observe, observe)
                next_state = np.reshape([next_state], (1, 240, 256, 1))
                next_history = np.append(next_state, history[:, :, :, :3], axis=3)

                # Maximum value of the policy
                self.avg_p_max += np.amax(self.actor.predict(np.float32(history / 255.)))

                real_reward = reward
                if start_life > info['life']:
                    dead = True
                    start_life = info['life']

                # ###
                # if coinStatus != info["coins"]:
                #     coinStatus = info["coins"]
                #     reward = reward + 10
                # if marioStatus != info["status"]:
                #     marioStatus = info["status"]
                #     reward = reward + 200
                # if flagStatus != info["flag_get"]:
                #     flagStatus = info["flag_get"]
                #     reward = reward + 200
                # if lifeStatus != info["life"]:
                #     lifeStatus = info["life"]
                #     reward = reward - 200
                #
                # if info["x_pos"] < 10:
                #     info["x_pos"] = 10
                # if info["time"] < 10:
                #     info["time"] = 10
                #
                # reward = reward + ((info["x_pos"] / info["time"]) + info["x_pos"]) / 100

                score += real_reward
                # reward = np.clip(reward, -1., 1.)

                # Store the sample
                self.append_sample(history, action, reward)

                gc.collect()

                if dead:
                    history = np.stack((next_state, next_state, next_state, next_state), axis=2)
                    history = np.reshape([history], (1, 240, 256, 4))
                else:
                    history = next_history

                # Train when the episode ends or the maximum number of timesteps is reached
                if self.t >= self.t_max or done:
                    self.train_model(done)
                    self.update_local_model()
                    self.t = 0

                if done:
                    # Record training information for each episode
                    episode += 1
                    ep_res = "episode: {},  score: {}, step: {}".format(episode, score, step)
                    print(ep_res)

                    if episode % 20 == 0:
                        slack_msg(ep_res)

                    # stats = [score, self.avg_p_max / float(step), step]
                    # for i in range(len(stats)):
                    #     self.sess.run(self.update_ops[i], feed_dict={ self.summary_placeholders[i]: float(stats[i]) })
                    # summary_str = self.sess.run(self.summary_op)
                    # self.summary_writer.add_summary(summary_str, episode + 1)
                    self.avg_p_max = 0
                    self.avg_loss = 0
                    step = 0
Example #13
def replay_game_from_actions(action_filepath, video_filepath,
                             video_info_filepath, output_dir):

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    with open(video_info_filepath) as f:
        video_info = json.load(f)

    with open(action_filepath) as json_file:
        data = json.load(json_file)

    cap = None

    if os.path.exists(video_filepath):
        cap = cv2.VideoCapture(video_filepath)

    first_world = "SuperMarioBros-1-1-v0"
    env = gym_super_mario_bros.make(first_world)

    next_state = env.reset()

    world = 1
    stage = 1
    stage_num = 0

    video_frame_length = 1 / 30

    video_start = video_info["start_time"]
    video_stop = video_info["stop_time"]
    game_start = data["start_time"]
    game_stop = data["stop_time"]

    print("Frame: %s" % str(video_frame_length))
    print("VT: %s" % str(video_stop - video_start))
    print("GT: %s" % str(game_stop - game_start))
    print("VS: %s" % str(video_start))
    print("GS: %s" % str(game_start))

    skipped_frames = 0

    while video_start < game_start:
        ret, frame = cap.read()
        video_start += video_frame_length
        skipped_frames += 1

    print("Skipped: %s" % str(skipped_frames))
    print("VS: %s" % str(video_start))
    print("GS: %s" % str(game_start))

    states = []

    is_first = True
    finish = False
    frame_number = 1

    steps = 0
    counter = 1

    for action in data["obs"]:

        next_state, reward, done, info = env.step(action)
        steps += 1

        if is_first:
            is_first = False
        else:

            if cap is not None:
                ret, frame = cap.read()
                if counter % 30 == 0:
                    cv2.imwrite(
                        os.path.join(output_dir, "face_%s.png" % frame_number),
                        frame)

            if counter % 30 == 0 or counter % 30 == 1:
                cvt_state = cv2.cvtColor(next_state, cv2.COLOR_BGR2RGB)
                cv2.imwrite(
                    os.path.join(output_dir, "game_%s.png" % frame_number),
                    cvt_state)

            is_first = True
            frame_number += 1
            counter += 1

        if info["flag_get"]:
            finish = True

        if done:
            done = False
            end = time.time()

            if finish or steps >= 16000:
                stage_num += 1
                world, stage, new_world = make_next_stage(
                    world, stage, stage_num)
                env.close()
                env = gym_super_mario_bros.make(new_world)
                finish = False
                steps = 0

            next_state = env.reset()
Example #14
def make_env(world, level, v="v1"):
    env_0 = gym_super_mario_bros.make("SuperMarioBros-" + str(world) + "-" +
                                      str(level) + "-" + v)  #Same as gym.make
    return BinarySpaceToDiscreteSpaceEnv(env_0, Moves)
Example #15
 def _make():
     env = gym_super_mario_bros.make('SuperMarioBros-1-1-v1')
     env = JoypadSpace(env, SIMPLE_MOVEMENT)
     env = wrap_mario(env)
     return env
Example #16
    def run(self):
        global episode
        env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
        env = BinarySpaceToDiscreteSpaceEnv(env, REALLY_COMPLEX_MOVEMENT)
        step = 0

        while episode < EPISODES:
            done = False

            max_x = 40
            no_progress = 0
            score = 0
            state = env.reset()

            '''
            # Making initial history with random actions
            # Seems to be not needed in LSTM
            for _ in range(5):
                next_state = state
                state, _, _, _ = env.step(random.randint(0, 12))
            '''
            state = crop_img(state)
            state = np.reshape([state], (1, 88, 128, 3))

            while not done:
                # Rendering code
                # Seems to be causing error in Mac OS
                #if self.thread_count==1:
                    #env.render()
                step += 1
                self.t += 1

                action, policy = self.get_action(state)

                # Taking 6 steps with selected action
                # Mimicking frame skip
                for _ in range(6):
                    next_state, reward, done, info = env.step(action)
                    score += reward
                    if done:
                        break

                # Kill Mario if Mario is making no progress for 10 seconds
                x_now = info.get('x_pos')
                # Handling exception x_pos = 65535
                if x_now == 65535:
                    x_now = max_x
                if max_x < x_now:
                    max_x = x_now
                    no_progress = 0
                else:
                    no_progress += 1
                if no_progress == 200:
                    done = True
                    reward -= 1
                    print("#",self.thread_count, " STUCK")
                # Preprocess each state
                #next_state = crop_img(next_state)
                next_state = np.reshape([crop_img(next_state)], (1, 88, 128, 3))


                # Average policy max value
                self.avg_p_max += np.amax(self.actor.predict(
                    np.float32(state / 255.)))

                score += reward

                # Appending sample
                state = next_state
                self.append_sample(state, action, reward)
                if self.t >= self.t_max or done:
                #if done:
                    self.train_model(done)
                    self.update_local_model()
                    #self.reset_lstm_state()
                    self.t = 0

                if done:
                    # Recording training information

                    episode += 1
                    print("#", self.thread_count, "  episode:", episode, "  score:", format(score, '.2f'), "  step:",
                          step, "max_x :", max_x)

                    stats = [score, self.avg_p_max / float(step),
                             step]
                    for i in range(len(stats)):
                        self.sess.run(self.update_ops[i], feed_dict={
                            self.summary_placeholders[i]: float(stats[i])
                        })
                    summary_str = self.sess.run(self.summary_op)
                    self.summary_writer.add_summary(summary_str, episode + 1)
                    self.avg_p_max = 0
                    self.avg_loss = 0
                    step = 0
Example #17
def replay_game_from_actions(action_filepath, video_filepath,
                             video_info_filepath, gap_path, output_dir):

    stage_order_len = len(_STAGE_ORDER)

    with open(video_info_filepath) as json_file:
        video_info = json.load(json_file)

    cap = cv2.VideoCapture(video_filepath)

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    with open(action_filepath) as json_file:
        data = json.load(json_file)

    first_world = 'SuperMarioBros-1-1-v0'
    env = gym_super_mario_bros.make(first_world)

    next_state = env.reset()
    start = time.time()

    world = 1
    stage = 1
    stage_num = 0

    video_frame_length = 1 / 30
    video_start = video_info['start_time']
    video_stop = video_info['stop_time']
    game_start = data['start_time']
    game_stop = data['stop_time']

    video_time = video_stop - video_start
    game_time = game_stop - game_start

    print('Frame: ' + str(video_frame_length))
    print('VT:' + str(video_time))
    print('GT:' + str(game_time))
    print('VS:' + str(video_start))
    print('GS:' + str(game_start))

    skipped_frames = 0

    while video_start < game_start:
        ret, frame = cap.read()
        video_start += video_frame_length
        skipped_frames += 1

    print('Skipped: ' + str(skipped_frames))
    print('VS:' + str(video_start))
    print('GS:' + str(game_start))

    is_first = True
    no = 0
    finish = False

    steps = 0

    total_steps = 0
    gap_indices = []

    counter = 1

    for action in data['obs']:

        env.render()

        next_state, reward, done, info = env.step(action)
        steps += 1
        total_steps += 1

        # Capture one game frame for each video frame by skipping every 2nd frame
        cvt_state = cv2.cvtColor(next_state, cv2.COLOR_BGR2RGB)
        if is_first:
            is_first = False
        else:
            if counter % 30 == 0 or counter % 30 == 1:
                cv2.imwrite(
                    os.path.join(output_dir, "game_" + str(no) + ".png"),
                    cvt_state)
            is_first = True
            no += 1
            counter += 1

        if info['flag_get']:
            finish = True

        if done:
            done = False
            end = time.time()

            if finish or steps >= 16000:
                stage_num += 1
                world, stage, new_world = make_next_stage(
                    world, stage, stage_num)
                env.close()
                env = gym_super_mario_bros.make(new_world)
                finish = False
                steps = 0
                gap_indices.append(total_steps)

            next_state = env.reset()

    #Extract video
    n_gaps = len(gap_indices)

    n_actions = len(data['obs'])
    missing = 126000 - n_actions
    video_frames_to_skip = missing / 2
    avg_gap_len = int(video_frames_to_skip / n_gaps)
    extra = video_frames_to_skip % n_gaps

    skips = 0
    counter = 1

    first = True
    print('Extracting video')
    for i in range(n_actions):
        if first:
            first = False
            i += 1
        else:
            first = True
            ret, frame = cap.read()
            if not ret:
                break
            if counter % 30 == 0:
                cv2.imwrite(
                    os.path.join(output_dir,
                                 "face_" + str(counter - 1) + ".png"), frame)
            i += 1
            counter += 1
        if i in gap_indices:
            skips += 1
            for j in range(int(avg_gap_len)):
                ret, frame = cap.read()
            if extra > 0:
                ret, frame = cap.read()
                extra -= 1
        i += 1

    print('Saving gap_info')
    gap_info = {}
    gap_info['indices'] = gap_indices
    gap_info['missing'] = missing

    print('Saving gaps to file')
    with open(gap_path, 'w') as outfile:
        json.dump(gap_info, outfile)
Example #18
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 10 21:00:57 2019

@author: tawehbeysolow
"""

import numpy as np
from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
from algorithms.actor_critic_utilities import train_model
from neural_networks.models import ActorCriticModel

#Parameters
environment = gym_super_mario_bros.make('SuperMarioBros-v0')
environment = BinarySpaceToDiscreteSpaceEnv(environment, SIMPLE_MOVEMENT)
observation = environment.reset()
learning_rate = 1e-4
gamma = 0.96
epsilon = 0.9
n_episodes = 10000
n_steps = 2048
max_steps = int(1e7)
_lambda = 0.95
value_coefficient = 0.5
entropy_coefficient = 0.01
max_grad_norm = 0.5
log_interval = 10

Example #19
        return w


def renew_w(preferences, dim):
    w = np.random.randn(reward_size)
    w = np.abs(w) / np.linalg.norm(w, ord=1, axis=0)
    preferences[dim] = w
    return preferences


if __name__ == '__main__':

    args = parser.parse_args()

    # get environment information
    env = JoypadSpace(gym_super_mario_bros.make(args.env_id), SIMPLE_MOVEMENT)
    input_size = env.observation_space.shape
    output_size = env.action_space.n
    reward_size = 5

    env.close()

    # setup
    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    tag = ["test", "train"][int(args.training)]
    log_dir = os.path.join(
        args.logdir, '{}_{}_{}_{}'.format(args.env_id, args.name, current_time,
                                          tag))
    writer = SummaryWriter(log_dir)

    model_path = 'saved/{}_{}_{}.model'.format(args.env_id, args.name,
Example #20
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT, COMPLEX_MOVEMENT, RIGHT_ONLY
import warnings
from helper_file import *

import matplotlib.pyplot as plt
import matplotlib.mlab as mlab

warnings.simplefilter("ignore", lineno=148)

current_user = getpass.getuser()
if (current_user == "gryslik"):
    model_path = '/Users/gryslik/gitRepos/qlearning/test_code/mario/models6-DDQN/'
else:
    model_path = '/home/ubuntu/data/code/mario/models6-DDQN/'

env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')  #2-2, 1-1
env = JoypadSpace(env, RIGHT_ONLY)

all_files = os.listdir(model_path)

if "travel_distance.csv" in all_files:
    models_processed = pd.read_csv(model_path +
                                   "travel_distance.csv")['model_name'].values
    models_to_compute = [
        x for x in all_files
        if (x not in models_processed and x != ".DS_Store"
            and x != "travel_distance.csv")
    ]
else:
    models_to_compute = [
        item for item in all_files
Example #21
 def __init__(self):
     super().__init__()
     self.env = gym_super_mario_bros.make("SuperMarioBros-v0")
     self.env = JoypadSpace(self.env, COMPLEX_MOVEMENT)
     self.history_size = 3
     self.action_repeats = 6
Example #22
def createenvironment(enviro, movementset):
    environment = gym_super_mario_bros.make(enviro)
    environment = BinarySpaceToDiscreteSpaceEnv(environment, movementset)

    return environment
Example #23
import tensorflow as tf  # Deep Learning library
import numpy as np  # Handle matrices
import random  # used to see if we explore or exploit
import warnings  # Ignore the warning messages that are normally printed during training because of skimage
import os
from collections import deque  # Ordered collection with ends
from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv
import gym_super_mario_bros  # import Kautenja's gym environment
from gym_super_mario_bros.actions import RIGHT_ONLY
from skimage import transform  # Helps us preprocess the frames
from skimage.color import rgb2gray  # Helps us grayscale the frames
warnings.filterwarnings('ignore')  # used to ignore warning messages

# Create our environment
env = gym_super_mario_bros.make('SuperMarioBros-v0')  # Creates the environment
env = BinarySpaceToDiscreteSpaceEnv(
    env, RIGHT_ONLY)  # restrict the action space to right-only movements

#env.render() # updates the action within the game or pretty much shows you the game is playing

#print("The size of our frame is: ", env.observation_space) # was originally a test to see what this was outputting.
#print("The action size is : ", env.action_space.n)  # the amount of actions we can take in the game

# Here we create an hot encoded version of our actions
# possible_actions = [[1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0]...]
possible_actions = np.array(
    np.identity(env.action_space.n, dtype=int).tolist())
#print("Possible Actions:", possible_actions)

print("This is the newest version")
Example #24
from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv
import gym_super_mario_bros

from gym_super_mario_bros.actions import SIMPLE_MOVEMENT

env = gym_super_mario_bros.make('SuperMarioBrosNoFrameskip-1-1-v0')
env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)

for _ in range(1000):
    observation = env.reset()
    done = False
    t = 0
    while not done:
        observation, reward, done, info = env.step(env.action_space.sample())
        env.render()
        t += 1
        if not t % 100:
            print(t, info)

env.close()
Example #25
        return w


def renew_w(preferences, dim):
    w = np.random.randn(reward_size)
    w = np.abs(w) / np.linalg.norm(w, ord=1, axis=0)
    preferences[dim] = w
    return preferences


if __name__ == '__main__':

    args = parser.parse_args()

    # get environment information
    env = BinarySpaceToDiscreteSpaceEnv(gym_super_mario_bros.make(args.env_id),
                                        SIMPLE_MOVEMENT)
    input_size = env.observation_space.shape
    output_size = env.action_space.n
    reward_size = 5

    env.close()

    # setup
    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    tag = ["test", "train"][int(args.training)]
    log_dir = os.path.join(
        args.logdir, '{}_{}_{}_{}'.format(args.env_id, args.name, current_time,
                                          tag))
    writer = SummaryWriter(log_dir)
Example #26
# C:\Users\Micha\Anaconda3\envs\tensorflow\Lib\site-packages\retro
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
import numpy as np
import cv2
import neat
import pickle

env = gym_super_mario_bros.make('SuperMarioBros-8-3-v1')
env = JoypadSpace(env, SIMPLE_MOVEMENT)
imgarray = []
xpos_end = 0

resume = True
restore_file = "neat-checkpoint-692"


def eval_genomes(genome, config):
    ob = env.reset()
    ac = env.action_space.sample()

    inx, iny, inc = env.observation_space.shape

    inx = int(inx / 8)
    iny = int(iny / 8)

    net = neat.nn.recurrent.RecurrentNetwork.create(genome, config)

    current_max_fitness = 0
    fitness_current = 0
Example #27
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT

from utils import process_frame

env = gym_super_mario_bros.make("SuperMarioBros-v1")
env = JoypadSpace(env, SIMPLE_MOVEMENT)

from random import randint
import numpy as np
import os
import tensorflow as tf
from model import generate_model

model_file_path = "./nn_model"
if os.path.exists(model_file_path):
    model = tf.keras.models.load_model(model_file_path)
else:
    img_rows, img_cols = 240, 256
    model = generate_model((img_rows, img_cols, 3), env.action_space.n)

# env.action_space.sample() = numbers, for example, 0,1,2,3...
# state = RGB of raw picture; is a numpy array with shape (240, 256, 3)
# reward = int; for example, 0, 1 ,2, ...
# done = False or True
# info = {'coins': 0, 'flag_get': False, 'life': 3, 'score': 0, 'stage': 1, 'status': 'small', 'time': 400, 'world': 1, 'x_pos': 40}

done = True
last_state = None
identity = np.identity(
Example #28
def create_mario_env(env_id, reward_type):
    env = gym_super_mario_bros.make(env_id)
    env = BinarySpaceToDiscreteSpaceEnv(env, PALETTE_ACTIONS)
    env = wrap_mario(env, reward_type)
    return env
Example #29
            state, reward, done, info = env.step(
                env.action_space.sample())  # enter integer between 0 and 11

        # experience replay

        # loop through epochs

        # perform action

        env.render()

    env.close()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-e',
                        default='SuperMarioBros-v3',
                        type=str,
                        help='environment id')
    parser.add_argument('-m',
                        default='human',
                        type=str,
                        help='render mode')

    ARGS = parser.parse_args()
    env = gym_super_mario_bros.make('SuperMarioBros-v0')
    #env = gym_super_mario_bros.make('SuperMarioBrosNoFrameskip-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, COMPLEX_MOVEMENT)

    train(env)
Example #30
import time

from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT


if __name__ == "__main__":
    for ver in range(1, 4):
        for world in range(1, 9):
            for stage in range(1, 5):
                env = gym_super_mario_bros.make(f'SuperMarioBros-{world}-{stage}-v{ver}')
                env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
                done = True
                for step in range(5):
                    if done:
                        state = env.reset()
                    state, reward, done, info = env.step(env.action_space.sample())
                    env.render()
                time.sleep(1.)
                env.close()