def get_dist(history, env, model):
    distances = []

    for i in range(len(history['state_json'])):
        state_json = history['state_json'][i]
        distance = []

        turtle = atari_wrappers.get_turtle(env)
        tb = turtle.toybox
        tb.write_state_json(state_json)

        enemies = state_json['enemies']
        player_index = (state_json['player']['position']['x'],
                        state_json['player']['position']['y'])
        player_pos = world_to_pixels(player_index, tb)
        # print('player pos', player_pos)

        for enemy in enemies:
            # print(enemy)
            enemy_index = (enemy['position']['x'], enemy['position']['y'])
            enemy_pos = world_to_pixels(enemy_index, tb)
            # print('enemy pos', enemy_pos)
            distance.append(
                abs(player_pos[0] - enemy_pos[0]) +
                abs(player_pos[1] - enemy_pos[1]))

        distances += [distance]

    return distances
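
A minimal usage sketch for get_dist, assuming env, model, and a rollout history with a 'state_json' list already exist (e.g. produced by the setup and rollout helpers in the later examples):

distances = get_dist(history, env, model)  # one list of Manhattan distances per frame
closest_enemy = [min(d) if d else None for d in distances]
print("closest enemy distance, first 10 frames:", closest_enemy[:10])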
Example #2
def main():
    arg_parser = common_arg_parser()
    args, unknown_args = arg_parser.parse_known_args()
    extra_args = parse_cmdline_kwargs(unknown_args)

    logger.configure()

    model, env = train(args, extra_args)
    env.close()

    logger.log("Running trained model")
    env = build_env(args, extra_args)
    obs = env.reset()
    turtle = atari_wrappers.get_turtle(env)
    scores = []
    session_scores = set()
    num_games = 0
    # This is a hack to get the starting screen, which throws an error in ALE for amidar
    num_steps = -1

    while num_games < 10:
        actions = model.step(obs)[0]
        num_lives = turtle.ale.lives()
        obs, _, done, info = env.step(actions)
        #done = done and (num_lives == 1 or turtle.ale.game_over())
        #time.sleep(1.0/60.0)
        done = num_lives == 1 and done 
        #done = done.any() if isinstance(done, np.ndarray) else done

        # Make regression testing faster by limiting score.
        # If we earn 500 or so points in any game, we can assume that we've learned something useful.
        if turtle.ale.get_score() > 500:
            done = True

        if isinstance(info, (list, tuple)):
            session_scores.add(np.average([d['score'] for d in info]))
        elif isinstance(info, dict):
            session_scores.add(info['score'])
        else:
            session_scores.add(-1)

        if done:
            num_games += 1
            score = max(session_scores)
            scores.append(score)
            session_scores = set()

            print("game %s: %s" % (num_games, score))
            obs = env.reset()


    print("Avg score: %f" % np.average(scores))
    print("Median score: %f" % np.median(scores))
    print("Std error score: %f" % sem(scores))
    print("Std dev score: %f" % stdev(scores))
    env.close()

    # Fail the regression test if the average score is below 50.
    if np.average(scores) < 50:
        sys.exit(-1)
def get_enemy_saliency(history, env, model, saliency_method):
    saliency = []

    for i in range(len(history['state_json'])):
        print(i)
        state_json = history['state_json'][i]
        frame = history['color_frame'][i]

        #get enemy pixels
        turtle = atari_wrappers.get_turtle(env)
        tb = turtle.toybox
        tb.write_state_json(state_json)
        enemy_pixels = get_concept_pixels_amidar(
            'enemies', state_json, [frame.shape[1], frame.shape[0]], tb)

        #get saliency for each enemy
        #save in right format
        saliency_i = []
        if saliency_method == 'perturbation':
            actor_saliency = score_frame(model,
                                         history,
                                         i,
                                         r=2,
                                         d=5,
                                         interp_func=occlude,
                                         mode='actor')
            S = np.zeros((110, 84))
            S[18:102, :] = actor_saliency
            # resize the padded saliency map (not the raw 84x84 crop) to the full frame
            S = imresize(S,
                         size=[frame.shape[0], frame.shape[1]],
                         interp='bilinear').astype(np.float32)

            for enemy in enemy_pixels:
                saliency_enemy_i = []
                for pixels in enemy:
                    saliency_enemy_i.append(S[pixels[1]][pixels[0]])
                saliency_i += [np.mean(saliency_enemy_i)]
            saliency += [saliency_i]
        elif saliency_method == 'object':
            for enemy in enemy_pixels:
                saliency_enemy_i = score_frame_by_pixels(model,
                                                         history,
                                                         i,
                                                         enemy,
                                                         mode='actor')
                saliency_i += [saliency_enemy_i]
            saliency += [saliency_i]
        elif saliency_method == 'jacobian':
            actor_saliency = get_gradients(model,
                                           history['ins'][i],
                                           mode='actor')
            S = np.zeros((110, 84))
            S[18:102, :] = actor_saliency[0, :, :, 3]**2
            # resize the padded saliency map (not the raw 84x84 crop) to the full frame
            S = imresize(S,
                         size=[frame.shape[0], frame.shape[1]],
                         interp='bilinear').astype(np.float32)

            for enemy in enemy_pixels:
                saliency_enemy_i = []
                for pixels in enemy:
                    saliency_enemy_i.append(S[pixels[1]][pixels[0]])
                saliency_i += [np.mean(saliency_enemy_i)]
            print(saliency_i)
            saliency += [saliency_i]

    return saliency
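
A hedged sketch comparing the three saliency_method options on the same rollout; history, env, and model are assumed to come from the same setup as above:

for method in ('perturbation', 'object', 'jacobian'):
    per_frame = get_enemy_saliency(history, env, model, saliency_method=method)
    # per_frame[i][j] is the saliency attributed to enemy j in frame i
    print(method, "frames scored:", len(per_frame))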
Example #4
def run_experiment(history, saliency_method='perturbation'):
    print("Setting up trained model")
    env, model = setUp(
        "AmidarToyboxNoFrameskip-v4", "a2c",
        "./models/AmidarToyboxNoFrameskip-v4/amidar4e7_a2c.model")
    env.reset()
    turtle = atari_wrappers.get_turtle(env)
    tb = turtle.toybox

    offsets = [-4, -2, 0, 2, 4]
    saliency_score = {enemy_id: {k: [] for k in offsets} for enemy_id in range(5)}
    distances = {enemy_id: {k: [] for k in offsets} for enemy_id in range(5)}

    for i in range(125, len(history['state_json'])):
        state_json = history['state_json'][i]
        frame = history['color_frame'][i]

        #set state to the same state as original game
        tb.write_state_json(state_json)
        enemy_pixels = get_concept_pixels_amidar(
            'enemies', state_json, [frame.shape[1], frame.shape[0]], tb)

        #get saliency
        # S = get_saliency(history, model, i, frame)

        #intervene for each enemy if saliency > 0
        for j, enemy in enumerate(enemy_pixels):
            tb.write_state_json(state_json)
            saliency_orig = get_saliency_on_enemy(
                history,
                model,
                i,
                frame,
                enemy,
                saliency_method=saliency_method)

            if saliency_orig > 0:
                dist_orig = get_dist(state_json, tb, j)

                for k in [-8, -6, 0, 2, 4]:
                    tb.write_state_json(state_json)
                    new_state_json, new_color_frame, new_obs = intervention_move_enemy(
                        state_json, env, model, tb, j, move_step=k)

                    if new_state_json is None:
                        continue

                    if k == 0:
                        saliency_score[j][0].append(saliency_orig)
                        distances[j][0].append(dist_orig)
                        continue

                    plt.imshow(frame)
                    plt.savefig(SAVE_DIR + 'frame{}_e{}'.format(i, j))
                    plt.imshow(new_color_frame)
                    plt.savefig(SAVE_DIR +
                                'frame{}_e{}_intervene{}'.format(i, j, k))

                    saliency = get_saliency_on_enemy(
                        history,
                        model,
                        i,
                        new_color_frame,
                        enemy,
                        inp=new_obs,
                        saliency_method=saliency_method)
                    dist = get_dist(new_state_json, tb, j)

                    # move steps -8 and -6 map onto the -4 and -2 offset buckets
                    if k in (-8, -6):
                        saliency_score[j][k + 4].append(saliency)
                        distances[j][k + 4].append(dist)
                    else:
                        saliency_score[j][k].append(saliency)
                        distances[j][k].append(dist)

    return saliency_score, distances
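
The returned dicts are keyed first by enemy id (0-4) and then by movement offset (-4, -2, 0, 2, 4). A small summary sketch, assuming numpy and a precomputed rollout history:

import numpy as np

saliency_score, distances = run_experiment(history, saliency_method='perturbation')
for enemy_id, by_offset in saliency_score.items():
    for offset in sorted(by_offset):
        values = by_offset[offset]
        if values:
            print("enemy %d, offset %+d: mean saliency %.4f (n=%d)" %
                  (enemy_id, offset, np.mean(values), len(values)))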
Example #5
def main():
    # configure logger, disable logging in child MPI processes (with rank > 0)

    arg_parser = common_arg_parser()
    args, unknown_args = arg_parser.parse_known_args()
    extra_args = parse_cmdline_kwargs(unknown_args)

    if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
        rank = 0
        logger.configure()
    else:
        logger.configure(format_strs=[])
        rank = MPI.COMM_WORLD.Get_rank()

    model, env = train(args, extra_args)
    env.close()

    if args.save_path is not None and rank == 0:
        save_path = osp.expanduser(args.save_path)
        model.save(save_path)

    if args.play:
        logger.log("Running trained model")
        env = build_env(args, extra_args)
        obs = env.reset()
        turtle = atari_wrappers.get_turtle(env)
        scores = []
        session_scores = set()
        num_games = 0
        # This is a hack to get the starting screen, which throws an error in ALE for amidar
        num_steps = -1

        while num_games < 100:
            actions = model.step(obs)[0]
            num_lives = turtle.ale.lives()
            obs, _, done, info = env.step(actions)
            #done = done and (num_lives == 1 or turtle.ale.game_over())
            #env.render()
            #time.sleep(1.0/60.0)
            done = num_lives == 1 and done
            #done = done.any() if isinstance(done, np.ndarray) else done

            if isinstance(info, (list, tuple)):
                session_scores.add(np.average([d['score'] for d in info]))
            elif isinstance(info, dict):
                session_scores.add(info['score'])
            else:
                session_scores.add(-1)

            if done:
                num_games += 1
                score = max(session_scores)
                scores.append(score)
                session_scores = set()

                print("game %s: %s" % (num_games, score))
                obs = env.reset()

        print("Avg score: %f" % np.average(scores))
        print("Median score: %f" % np.median(scores))
        print("Std error score: %f" % sem(scores))
        print("Std dev score: %f" % stdev(scores))
        env.close()
Example #6
def single_intervention_move_ball(model,
                                  env,
                                  rollout_history,
                                  max_ep_len=3e3,
                                  move_distance=1,
                                  intervene_step=20):
    history = {
        'ins': [],
        'a_logits': [],
        'values': [],
        'actions': [],
        'rewards': [],
        'color_frame': [],
        'state_json': []
    }
    episode_length, epr, done = 0, 0, False

    #logger.log("Running trained model")
    print("Running trained model")
    obs = env.reset()
    turtle = atari_wrappers.get_turtle(env)
    tb = turtle.toybox

    #start new game and set start state to the same state as original game
    tb.new_game()
    tb.write_state_json(rollout_history['state_json'][0])

    #add start state to history
    state_json = tb.state_to_json()
    color_frame = tb.get_rgb_frame()
    history['ins'].append(obs)
    history['a_logits'].append(None)
    history['values'].append(None)
    history['actions'].append(None)
    history['rewards'].append(epr)
    history['color_frame'].append(color_frame)
    history['state_json'].append(state_json)
    episode_length += 1

    # This is a hack to get the starting screen, which throws an error in ALE for amidar
    num_steps = -1

    while episode_length < intervene_step:
        obs, reward, done, info = env.step(
            rollout_history['actions'][episode_length])
        epr += reward[0]
        color_frame = tb.get_rgb_frame()
        state_json = tb.state_to_json()

        #save info
        history['ins'].append(obs)
        history['a_logits'].append(rollout_history['a_logits'][episode_length])
        history['values'].append(rollout_history['values'][episode_length])
        history['actions'].append(rollout_history['actions'][episode_length])
        history['rewards'].append(epr)
        history['color_frame'].append(color_frame)
        history['state_json'].append(state_json)

        episode_length += 1

    print("Intervening on ball now and forward simulating")
    with BreakoutIntervention(tb) as intervention:
        ball_pos = intervention.get_ball_position()
        print("old: ", ball_pos)
        ball_pos['x'] = ball_pos['x'] + move_distance
        ball_pos['y'] = ball_pos['y'] + move_distance
        print("new: ", ball_pos)
        intervention.set_ball_position(ball_pos)
        ball_pos_post = intervention.get_ball_position()
        assert ball_pos_post['x'] == ball_pos['x']

    #forward simulate 3 steps with no-op action
    for i in range(3):
        obs, _, _, _ = env.step(0)

    while not done and episode_length <= max_ep_len:
        episode_length += 1
        actions, value, _, _, a_logits, _ = model.step(obs)
        num_lives = turtle.ale.lives()
        obs, reward, done, info = env.step(actions)
        epr += reward[0]
        color_frame = tb.get_rgb_frame()
        state_json = tb.state_to_json()

        #save info
        history['ins'].append(obs)
        history['a_logits'].append(a_logits)
        history['values'].append(value)
        history['actions'].append(actions[0])
        history['color_frame'].append(color_frame)
        history['rewards'].append(epr)
        history['state_json'].append(state_json)
        print('\tstep # {}, reward {:.0f}'.format(episode_length, epr),
              end='\r')

    #check whether output has same length for different keys
    for key in history.keys():
        assert (len(history['ins']) == len(history[key]))

    return history
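
A sketch comparing the intervened episode with the original rollout; model, env, and rollout_history are assumed to come from the same setup used elsewhere in these examples:

iv_history = single_intervention_move_ball(model, env, rollout_history,
                                           move_distance=2, intervene_step=20)
# 'rewards' stores the running episode reward, so the last entry is the final return
print("original return:  ", rollout_history['rewards'][-1])
print("intervened return:", iv_history['rewards'][-1])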
Example #7
def single_intervention_move_enemy_back(model, env, rollout_history, enemy_id,
                                        intervening_frame):
    history = {
        'ins': [],
        'a_logits': [],
        'values': [],
        'actions': [],
        'rewards': [],
        'color_frame': [],
        'state_json': []
    }
    episode_length, epr, done = 0, 0, False

    #pick random distance to move
    move_step = random.randint(6, 10)

    #logger.log("Running trained model")
    print("Running trained model")
    obs = env.reset()
    turtle = atari_wrappers.get_turtle(env)
    tb = turtle.toybox

    #start new game and set start state to the same state as original game
    tb.new_game()
    tb.write_state_json(rollout_history['state_json'][0])

    #add start state to history
    state_json = tb.state_to_json()
    color_frame = tb.get_rgb_frame()
    history['ins'].append(obs)
    history['a_logits'].append(None)
    history['values'].append(None)
    history['actions'].append(None)
    history['rewards'].append(epr)
    history['color_frame'].append(color_frame)
    history['state_json'].append(state_json)
    episode_length += 1

    # This is a hack to get the starting screen, which throws an error in ALE for amidar
    num_steps = -1

    #run episode as original until first intervene step
    while episode_length < intervening_frame:
        episode_length += 1
        obs, reward, done, info = env.step(
            rollout_history['actions'][episode_length])
        epr += reward[0]
        color_frame = tb.get_rgb_frame()
        state_json = tb.state_to_json()

        #save info
        history['ins'].append(obs)
        history['a_logits'].append(rollout_history['a_logits'][episode_length])
        history['values'].append(rollout_history['values'][episode_length])
        history['actions'].append(rollout_history['actions'][episode_length])
        history['rewards'].append(epr)
        history['color_frame'].append(color_frame)
        history['state_json'].append(state_json)

        episode_length += 1

    #intervene by moving enemy behind
    print("Intervening on enemy position now -- moving {} steps behind".format(
        move_step))
    print("old next step: ",
          state_json['enemies'][enemy_id]['ai']['EnemyLookupAI']['next'])

    next_step = state_json['enemies'][enemy_id]['ai']['EnemyLookupAI'][
        'next'] - move_step
    if next_step < 0:
        next_step = 0
    state_json['enemies'][enemy_id]['ai']['EnemyLookupAI']['next'] = next_step

    print("new next step: ",
          state_json['enemies'][enemy_id]['ai']['EnemyLookupAI']['next'])
    tb.write_state_json(state_json)

    #forward simulate 3 steps with no-op action
    for i in range(3):
        obs, _, _, _ = env.step(0)

    #forward simulate for remainder of episode
    while not done:
        episode_length += 1
        actions, value, _, _, a_logits, _ = model.step(obs)
        num_lives = turtle.ale.lives()
        obs, reward, done, info = env.step(actions)
        epr += reward[0]
        color_frame = tb.get_rgb_frame()
        state_json = tb.state_to_json()

        #save info
        history['ins'].append(obs)
        history['a_logits'].append(a_logits)
        history['values'].append(value)
        history['actions'].append(actions[0])
        history['rewards'].append(epr)
        history['color_frame'].append(color_frame)
        history['state_json'].append(state_json)
        print('\tstep # {}, reward {:.0f}'.format(episode_length, epr),
              end='\r')

    #check whether output has same length for different keys
    for key in history.keys():
        assert (len(history['ins']) == len(history[key]))

    return history
Example #8
def multiple_intervention_decrement_score(model,
                                          env,
                                          rollout_history,
                                          max_ep_len=3e3,
                                          max_score=3e3):
    history = {
        'ins': [],
        'a_logits': [],
        'values': [],
        'actions': [],
        'rewards': [],
        'color_frame': [],
        'state_json': []
    }
    episode_length, epr, done = 0, 0, False

    #pick random decrement size
    decrement_size = random.randint(1, 21)
    print("decrement_size: ", decrement_size)

    #logger.log("Running trained model")
    print("Running trained model")
    obs = env.reset()
    turtle = atari_wrappers.get_turtle(env)
    tb = turtle.toybox

    #start new game and set start state to the same state as original game with max_score
    tb.new_game()
    state_json = rollout_history['state_json'][0]
    state_json['score'] = int(max_score)
    tb.write_state_json(state_json)

    #add start state to history
    state_json = tb.state_to_json()
    color_frame = tb.get_rgb_frame()
    history['ins'].append(obs)
    history['a_logits'].append(None)
    history['values'].append(None)
    history['actions'].append(None)
    history['rewards'].append(epr)
    history['color_frame'].append(color_frame)
    history['state_json'].append(state_json)
    episode_length += 1

    # This is a hack to get the starting screen, which throws an error in ALE for amidar
    num_steps = -1

    while not done and episode_length <= max_ep_len:
        episode_length += 1
        actions, value, _, _, a_logits, _ = model.step(obs)
        num_lives = turtle.ale.lives()
        obs, reward, done, info = env.step(actions)
        epr += reward[0]
        state_json = tb.state_to_json()

        #intervene on score
        # print("Intervening on score now to decrement and forward simulating")
        if state_json['score'] - decrement_size >= 0:
            state_json['score'] -= decrement_size
        else:
            state_json['score'] = 0
        tb.write_state_json(state_json)
        color_frame = tb.get_rgb_frame()

        #save info
        history['ins'].append(obs)
        history['a_logits'].append(a_logits)
        history['values'].append(value)
        history['actions'].append(actions[0])
        history['rewards'].append(epr)
        history['color_frame'].append(color_frame)
        history['state_json'].append(state_json)
        print('\tstep # {}, reward {:.0f}'.format(episode_length, epr),
              end='\r')

    #check whether output has same length for different keys
    for key in history.keys():
        assert (len(history['ins']) == len(history[key]))

    return history
Example #9
def multiple_intervention_modify_score(
        model,
        env,
        rollout_history,
        max_ep_len=3e3,
        abs_score=0,
        intervene_steps=[20, 40, 80, 100, 120, 140],
        random_score=False):
    history = {
        'ins': [],
        'a_logits': [],
        'values': [],
        'actions': [],
        'rewards': [],
        'color_frame': [],
        'state_json': []
    }
    episode_length, epr, done = 0, 0, False

    #override intervene_steps with a random step spacing between 5 and 21 (inclusive)
    IV_step_delta = random.randint(5, 21)
    intervene_steps = range(IV_step_delta, int(max_ep_len), IV_step_delta)
    print("intervene steps: ", intervene_steps)
    print("rollout_history len: ", len(rollout_history['ins']))

    #logger.log("Running trained model")
    print("Running trained model")
    obs = env.reset()
    turtle = atari_wrappers.get_turtle(env)
    tb = turtle.toybox

    #start new game and set start state to the same state as original game
    tb.new_game()
    tb.write_state_json(rollout_history['state_json'][0])

    #add start state to history
    state_json = tb.state_to_json()
    color_frame = tb.get_rgb_frame()
    history['ins'].append(obs)
    history['a_logits'].append(None)
    history['values'].append(None)
    history['actions'].append(None)
    history['rewards'].append(epr)
    history['color_frame'].append(color_frame)
    history['state_json'].append(state_json)
    episode_length += 1

    # This is a hack to get the starting screen, which throws an error in ALE for amidar
    num_steps = -1

    #run episode as original until first intervene step
    while episode_length < intervene_steps[0]:
        obs, reward, done, info = env.step(
            rollout_history['actions'][episode_length])
        epr += reward[0]
        color_frame = tb.get_rgb_frame()
        state_json = tb.state_to_json()

        #save info
        history['ins'].append(obs)
        history['a_logits'].append(rollout_history['a_logits'][episode_length])
        history['values'].append(rollout_history['values'][episode_length])
        history['actions'].append(rollout_history['actions'][episode_length])
        history['rewards'].append(epr)
        history['color_frame'].append(color_frame)
        history['state_json'].append(state_json)

        episode_length += 1

    #forward simulate and intervene at all consequent intervene steps
    while not done and episode_length <= max_ep_len:
        episode_length += 1
        #intervene
        if episode_length in intervene_steps:
            obs = amidar_modify_score(tb, rollout_history, episode_length,
                                      abs_score, env, random_score)

        actions, value, _, _, a_logits, _ = model.step(obs)
        num_lives = turtle.ale.lives()
        obs, reward, done, info = env.step(actions)
        epr += reward[0]
        color_frame = tb.get_rgb_frame()
        state_json = tb.state_to_json()

        #save info
        history['ins'].append(obs)
        history['a_logits'].append(a_logits)
        history['values'].append(value)
        history['actions'].append(actions[0])
        history['rewards'].append(epr)
        history['color_frame'].append(color_frame)
        history['state_json'].append(state_json)
        print('\tstep # {}, reward {:.0f}'.format(episode_length, epr),
              end='\r')

    #check whether output has same length for different keys
    for key in history.keys():
        assert (len(history['ins']) == len(history[key]))

    return history
Example #10
def single_intervention_modify_score(model,
                                     env,
                                     rollout_history,
                                     max_ep_len=3e3,
                                     abs_score=0,
                                     intervene_step=20):
    history = {
        'ins': [],
        'a_logits': [],
        'values': [],
        'actions': [],
        'rewards': [],
        'color_frame': [],
        'state_json': []
    }
    episode_length, epr, done = 0, 0, False

    #logger.log("Running trained model")
    print("Running trained model")
    obs = env.reset()
    turtle = atari_wrappers.get_turtle(env)
    tb = turtle.toybox

    #start new game and set start state to the same state as original game
    tb.new_game()
    tb.write_state_json(rollout_history['state_json'][0])

    #add start state to history
    state_json = tb.state_to_json()
    color_frame = tb.get_rgb_frame()
    history['ins'].append(obs)
    history['a_logits'].append(None)
    history['values'].append(None)
    history['actions'].append(None)
    history['rewards'].append(epr)
    history['color_frame'].append(color_frame)
    history['state_json'].append(state_json)
    episode_length += 1

    # This is a hack to get the starting screen, which throws an error in ALE for amidar
    num_steps = -1

    while episode_length < intervene_step:
        obs, reward, done, info = env.step(
            rollout_history['actions'][episode_length])
        epr += reward[0]
        color_frame = tb.get_rgb_frame()
        state_json = tb.state_to_json()

        #save info
        history['ins'].append(obs)
        history['a_logits'].append(rollout_history['a_logits'][episode_length])
        history['values'].append(rollout_history['values'][episode_length])
        history['actions'].append(rollout_history['actions'][episode_length])
        history['rewards'].append(epr)
        history['color_frame'].append(color_frame)
        history['state_json'].append(state_json)

        episode_length += 1

    obs = amidar_modify_score(tb, rollout_history, episode_length, abs_score,
                              env)

    while not done and episode_length <= max_ep_len:
        episode_length += 1
        actions, value, _, _, a_logits, _ = model.step(obs)
        num_lives = turtle.ale.lives()
        obs, reward, done, info = env.step(actions)
        epr += reward[0]
        color_frame = tb.get_rgb_frame()
        state_json = tb.state_to_json()

        #save info
        history['ins'].append(obs)
        history['a_logits'].append(a_logits)
        history['values'].append(value)
        history['actions'].append(actions[0])
        history['rewards'].append(epr)
        history['color_frame'].append(color_frame)
        history['state_json'].append(state_json)
        print('\tstep # {}, reward {:.0f}'.format(episode_length, epr),
              end='\r')

    #check whether output has same length for different keys
    for key in history.keys():
        assert (len(history['ins']) == len(history[key]))

    return history
Example #11
def single_intervention_shift_bricks(model,
                                     env,
                                     rollout_history,
                                     max_ep_len=3e3,
                                     intervene_step=20,
                                     shift_dist=1,
                                     move_ball=False,
                                     move_paddle=False):
    history = {
        'ins': [],
        'a_logits': [],
        'values': [],
        'actions': [],
        'rewards': [],
        'color_frame': [],
        'state_json': []
    }
    episode_length, epr, done = 0, 0, False

    #logger.log("Running trained model")
    print("Running trained model")
    obs = env.reset()
    turtle = atari_wrappers.get_turtle(env)
    tb = turtle.toybox

    #start new game and set start state to the same state as original game
    tb.new_game()
    tb.write_state_json(rollout_history['state_json'][0])

    #add start state to history
    state_json = tb.state_to_json()
    color_frame = tb.get_rgb_frame()
    history['ins'].append(obs)
    history['a_logits'].append(None)
    history['values'].append(None)
    history['actions'].append(None)
    history['rewards'].append(epr)
    history['color_frame'].append(color_frame)
    history['state_json'].append(state_json)
    episode_length += 1

    # This is a hack to get the starting screen, which throws an error in ALE for amidar
    num_steps = -1

    while episode_length < intervene_step:
        obs, reward, done, info = env.step(
            rollout_history['actions'][episode_length])
        epr += reward[0]
        color_frame = tb.get_rgb_frame()
        state_json = tb.state_to_json()

        #save info
        history['ins'].append(obs)
        history['a_logits'].append(rollout_history['a_logits'][episode_length])
        history['values'].append(rollout_history['values'][episode_length])
        history['actions'].append(rollout_history['actions'][episode_length])
        history['rewards'].append(epr)
        history['color_frame'].append(color_frame)
        history['state_json'].append(state_json)

        episode_length += 1

    print(
        "Intervening on shifting bricks now and forward simulating with shift distance of {}"
        .format(shift_dist))
    #shift the pattern of dead bricks right by shift_dist brick widths, wrapping past x = 216
    with BreakoutIntervention(tb) as intervention:
        bricks = intervention.get_bricks()
        brick_size = (int(bricks[0]['size']['x']), int(bricks[0]['size']['y']))
        bricks_to_flip = []

        #for each dead brick, mark the brick shift_dist brick-widths to its right to be flipped dead
        for i, brick in enumerate(bricks):

            if brick['alive'] is False:
                intervention.set_brick(i)
                shift_xPos = brick['position']['x'] + shift_dist * brick_size[0]
                if shift_xPos > 216:
                    shift_xPos = 12 * ((shift_xPos - 216) / 12)

                for j, brick2 in enumerate(bricks):
                    if brick2['position']['x'] == shift_xPos and brick2[
                            'position']['y'] == brick['position']['y']:
                        #print(brick2)
                        bricks_to_flip.append(j)
                        break
                #print(brick)

        #print(bricks_to_flip)
        for brick_index in bricks_to_flip:
            intervention.set_brick(brick_index, alive=False)

        #move ball to left side of board
        if move_ball:
            move_distance = 70
            ball_pos = intervention.get_ball_position()
            print("old ball pos: ", ball_pos)
            ball_pos['x'] = ball_pos['x'] - move_distance
            ball_pos['y'] = ball_pos['y'] - move_distance
            print("new ball pos: ", ball_pos)
            intervention.set_ball_position(ball_pos)
            ball_pos_post = intervention.get_ball_position()
            assert ball_pos_post['x'] == ball_pos['x']

        #move paddle to left side of board
        if move_paddle:
            move_distance = 70  # defined here too so this branch works when move_ball is False
            pos = intervention.get_paddle_position()
            print("old paddle pos: ", pos)
            pos['x'] = pos['x'] - move_distance
            print("new paddle pos: ", pos)
            intervention.set_paddle_position(pos)

    #forward simulate 3 steps with no-op action
    for i in range(3):
        obs, _, _, _ = env.step(0)

    while not done and episode_length <= max_ep_len:
        episode_length += 1
        actions, value, _, _, a_logits, _ = model.step(obs)
        num_lives = turtle.ale.lives()
        obs, reward, done, info = env.step(actions)
        epr += reward[0]
        color_frame = tb.get_rgb_frame()
        state_json = tb.state_to_json()
        #time.sleep(1.0/60.0)

        #save info
        history['ins'].append(obs)
        history['a_logits'].append(a_logits)
        history['values'].append(value)
        history['actions'].append(actions[0])
        history['rewards'].append(epr)
        history['color_frame'].append(color_frame)
        history['state_json'].append(state_json)
        print('\tstep # {}, reward {:.0f}'.format(episode_length, epr),
              end='\r')

    #check whether output has same length for different keys
    for key in history.keys():
        assert (len(history['ins']) == len(history[key]))

    return history
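
A sketch exercising the optional ball and paddle flags, under the same assumptions about model, env, and rollout_history as above:

for flags in ({'move_ball': False, 'move_paddle': False},
              {'move_ball': True, 'move_paddle': False},
              {'move_ball': True, 'move_paddle': True}):
    iv_history = single_intervention_shift_bricks(model, env, rollout_history,
                                                  intervene_step=20, shift_dist=2,
                                                  **flags)
    print(flags, "-> final return:", iv_history['rewards'][-1])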
Example #12
def main():
    # configure logger, disable logging in child MPI processes (with rank > 0)
    arg_parser = common_arg_parser()
    args, unknown_args = arg_parser.parse_known_args()
    extra_args = parse_cmdline_kwargs(unknown_args)

    if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
        rank = 0
        logger.configure()
    else:
        logger.configure(format_strs=[])
        rank = MPI.COMM_WORLD.Get_rank()

    model, env = train(args, extra_args)
    env.close()

    if args.play:
        logger.log("Running trained model")
        env = build_env(args)
        obs = env.reset()
        turtle = atari_wrappers.get_turtle(env)
        found_seed = {}
        seed_state = None

        if not isinstance(turtle, ToyboxBaseEnv):
            raise ValueError(
                "Not a ToyboxBaseEnv; cannot export state to JSON", turtle)
        else:
            if isinstance(turtle, BreakoutEnv):
                found_seed['breakout_bricks_remaining'] = False
                found_seed['breakout_channel_count'] = False

        while not all(found_seed.values()):
            actions = model.step(obs)[0]
            num_lives = turtle.ale.lives()
            obs, _, done, info = env.step(actions)
            done = num_lives == 1 and done

            if isinstance(turtle, AmidarEnv):
                pass

            if isinstance(turtle, BreakoutEnv):
                # find single brick remaining seed
                if turtle.toybox.rstate.breakout_bricks_remaining() == 1:
                    found_seed['breakout_bricks_remaining'] = True
                    save_seed_json('breakout_bricks_remaining',
                                   turtle.toybox.to_json(),
                                   extra_args['load_path'])

                # record the first channel seed only
                if (turtle.toybox.rstate.breakout_channel_count()
                        and not found_seed['breakout_channel_count']):
                    found_seed['breakout_channel_count'] = True
                    save_seed_json('breakout_channel_count',
                                   turtle.toybox.to_json(),
                                   extra_args['load_path'])

            if done:
                obs = env.reset()
                print("Game ended before predicate met. New game.")

        env.close()