def get_dist(history, env, model):
    """Return player-to-enemy Manhattan distances for every recorded state.

    Each state in ``history['state_json']`` is written into the Toybox
    instance obtained from ``env``; the player position and every enemy
    position are then converted with ``world_to_pixels`` and compared.

    Args:
        history: Mapping with a ``'state_json'`` sequence of state dicts.
            Each state is read for ``'enemies'`` and ``'player'`` entries
            that carry a ``'position'`` with ``'x'`` and ``'y'``.
        env: Environment handed to ``atari_wrappers.get_turtle``.
        model: Unused; kept so the signature stays unchanged.

    Returns:
        A list with one entry per state; each entry is a list holding one
        distance per enemy (``|dx| + |dy|`` of the converted positions).
    """
    per_state = []
    for state in history['state_json']:
        toybox = atari_wrappers.get_turtle(env).toybox
        toybox.write_state_json(state)

        enemy_list = state['enemies']
        player_cell = (state['player']['position']['x'],
                       state['player']['position']['y'])
        px, py = world_to_pixels(player_cell, toybox)[:2]

        row = []
        for foe in enemy_list:
            foe_cell = (foe['position']['x'], foe['position']['y'])
            foe_px = world_to_pixels(foe_cell, toybox)
            row.append(abs(px - foe_px[0]) + abs(py - foe_px[1]))
        per_state.append(row)
    return per_state
def main():
    """Train a model, then play ten games as a regression test.

    After training, a fresh environment is built and the model plays until
    ten games have finished. A game is finished when ``done`` is reported
    while on the last life (``num_lives == 1``), or as soon as the ALE score
    exceeds 500. Summary statistics are printed, and the process exits with
    status ``-1`` when the average score is below 50.
    """
    arg_parser = common_arg_parser()
    args, unknown_args = arg_parser.parse_known_args()
    extra_args = parse_cmdline_kwargs(unknown_args)
    logger.configure()

    model, env = train(args, extra_args)
    env.close()

    logger.log("Running trained model")
    env = build_env(args, extra_args)
    obs = env.reset()
    turtle = atari_wrappers.get_turtle(env)

    scores = []
    session_scores = set()
    num_games = 0
    while num_games < 10:
        actions = model.step(obs)[0]
        # Read the life count before stepping: the episode only counts as
        # over when `done` arrives while the agent was on its last life.
        num_lives = turtle.ale.lives()
        obs, _, done, info = env.step(actions)
        done = num_lives == 1 and done

        # Make regression testing faster by limiting score.
        # If we earn 500 or so points in any game, we can assume that we've
        # learned something useful.
        if turtle.ale.get_score() > 500:
            done = True

        if isinstance(info, (list, tuple)):
            session_scores.add(np.average([d['score'] for d in info]))
        elif isinstance(info, dict):
            # Previously `session_scores.add(['score'])`, which raises
            # TypeError (a list is unhashable) instead of recording a score.
            session_scores.add(info['score'])
        else:
            session_scores.add(-1)

        if done:
            num_games += 1
            score = max(session_scores)
            scores.append(score)
            print("game %s: %s" % (num_games, score))
            obs = env.reset()
            session_scores = set()

    print("Avg score: %f" % np.average(scores))
    print("Median score: %f" % np.median(scores))
    print("Std error score: %f" % sem(scores))
    print("Std dev score: %f" % stdev(scores))
    env.close()

    # Fail the regression test if the average score is below 50.
    if np.average(scores) < 50:
        sys.exit(-1)
def _mean_saliency_per_enemy(saliency_map, enemy_pixels):
    """Average ``saliency_map`` over each enemy's pixel list.

    Pixels are ``(x, y)`` pairs, so the map is indexed as ``[y][x]``.
    """
    return [
        np.mean([saliency_map[pixel[1]][pixel[0]] for pixel in enemy])
        for enemy in enemy_pixels
    ]


def get_enemy_saliency(history, env, model, saliency_method):
    """Compute one saliency value per enemy for every recorded frame.

    For each index ``i`` the stored state is written into the Toybox
    instance, the enemy pixels are looked up with
    ``get_concept_pixels_amidar`` and a saliency value per enemy is derived
    according to ``saliency_method``.

    Args:
        history: Mapping with ``'state_json'`` and ``'color_frame'``
            sequences (and ``'ins'`` for the jacobian method).
        env: Environment handed to ``atari_wrappers.get_turtle``.
        model: Model forwarded to the saliency helpers.
        saliency_method: ``'perturbation'``, ``'object'`` or ``'jacobian'``.
            Any other value contributes nothing, so the result is ``[]``.

    Returns:
        A list with one entry per frame; each entry is a list holding one
        saliency value per enemy.
    """
    saliency = []
    for i, state_json in enumerate(history['state_json']):
        print(i)
        frame = history['color_frame'][i]

        # Restore the recorded state so the pixel lookup sees this frame.
        tb = atari_wrappers.get_turtle(env).toybox
        tb.write_state_json(state_json)
        enemy_pixels = get_concept_pixels_amidar(
            'enemies', state_json, [frame.shape[1], frame.shape[0]], tb)
        frame_size = [frame.shape[0], frame.shape[1]]

        if saliency_method == 'perturbation':
            actor_saliency = score_frame(
                model, history, i, r=2, d=5, interp_func=occlude,
                mode='actor')
            # The original filled a (110, 84) scratch array here and then
            # overwrote it immediately; only the resized map is ever read.
            resized = imresize(
                actor_saliency, size=frame_size,
                interp='bilinear').astype(np.float32)
            saliency.append(_mean_saliency_per_enemy(resized, enemy_pixels))
        elif saliency_method == 'object':
            saliency.append([
                score_frame_by_pixels(model, history, i, enemy, mode='actor')
                for enemy in enemy_pixels
            ])
        elif saliency_method == 'jacobian':
            gradients = get_gradients(model, history['ins'][i], mode='actor')
            # Squared gradient of channel 3 of the first batch element.
            # NOTE(review): presumably the newest frame of a stacked
            # observation -- confirm against the model input layout.
            squared = gradients[0, :, :, 3]**2
            resized = imresize(
                squared, size=frame_size,
                interp='bilinear').astype(np.float32)
            saliency_i = _mean_saliency_per_enemy(resized, enemy_pixels)
            print(saliency_i)
            saliency.append(saliency_i)
    return saliency
def run_experiment(history, saliency_method='perturbation'):
    """Measure enemy saliency before and after moving each enemy.

    Starting at frame 125 of ``history``, every enemy whose saliency on the
    original frame is positive is moved by each step in ``[-8, -6, 0, 2, 4]``
    via ``intervention_move_enemy``. Saliency and distance are stored per
    enemy index under the keys ``-4, -2, 0, 2, 4`` (steps ``-8`` and ``-6``
    are filed under ``-4`` and ``-2``). Before/after frames are saved as
    images under ``SAVE_DIR`` for every non-zero step.

    Args:
        history: Mapping with ``'state_json'`` and ``'color_frame'``
            sequences of a recorded rollout.
        saliency_method: Forwarded to ``get_saliency_on_enemy``.

    Returns:
        ``(saliency_score, distances)``: two dicts keyed by enemy index
        (0-4), each mapping a bucket key to a list of collected values.
    """
    print("Setting up trained model")
    env, model = setUp(
        "AmidarToyboxNoFrameskip-v4", "a2c",
        "./models/AmidarToyboxNoFrameskip-v4/amidar4e7_a2c.model")
    env.reset()
    tb = atari_wrappers.get_turtle(env).toybox

    bucket_keys = (-2, -4, 0, 4, 2)
    saliency_score = {
        enemy_id: {key: [] for key in bucket_keys} for enemy_id in range(5)
    }
    distances = {
        enemy_id: {key: [] for key in bucket_keys} for enemy_id in range(5)
    }

    for frame_idx in range(125, len(history['state_json'])):
        state_json = history['state_json'][frame_idx]
        frame = history['color_frame'][frame_idx]

        # Put the simulator into the recorded state of this frame.
        tb.write_state_json(state_json)
        enemy_pixels = get_concept_pixels_amidar(
            'enemies', state_json, [frame.shape[1], frame.shape[0]], tb)

        for enemy_id, enemy in enumerate(enemy_pixels):
            tb.write_state_json(state_json)
            baseline_saliency = get_saliency_on_enemy(
                history, model, frame_idx, frame, enemy,
                saliency_method=saliency_method)
            # Only enemies with positive saliency are intervened on.
            if not baseline_saliency > 0:
                continue
            baseline_dist = get_dist(state_json, tb, enemy_id)

            for step in [-8, -6, 0, 2, 4]:
                tb.write_state_json(state_json)
                moved = intervention_move_enemy(
                    state_json, env, model, tb, enemy_id, move_step=step)
                new_state_json, new_color_frame, new_obs = moved
                if new_state_json is None:
                    continue

                if step == 0:
                    # Step 0 is the control: reuse the baseline values.
                    saliency_score[enemy_id][0].append(baseline_saliency)
                    distances[enemy_id][0].append(baseline_dist)
                    continue

                plt.imshow(frame)
                plt.savefig(
                    SAVE_DIR + 'frame{}_e{}'.format(frame_idx, enemy_id))
                plt.imshow(new_color_frame)
                plt.savefig(SAVE_DIR + 'frame{}_e{}_intervene{}'.format(
                    frame_idx, enemy_id, step))

                moved_saliency = get_saliency_on_enemy(
                    history, model, frame_idx, new_color_frame, enemy,
                    inp=new_obs, saliency_method=saliency_method)
                moved_dist = get_dist(new_state_json, tb, enemy_id)

                # Steps -8 and -6 are stored under keys -4 and -2.
                bucket = step + 4 if step in (-6, -8) else step
                saliency_score[enemy_id][bucket].append(moved_saliency)
                distances[enemy_id][bucket].append(moved_dist)

    return saliency_score, distances
def main():
    """Train a model, optionally save it, and optionally play 100 games.

    Logging is configured normally on MPI rank 0 (or when MPI is
    unavailable) and silenced on other ranks. When ``args.save_path`` is set
    the model is saved on rank 0. When ``args.play`` is set the model plays
    until 100 games have finished, where a game is finished when ``done`` is
    reported while on the last life, and score statistics are printed.
    """
    arg_parser = common_arg_parser()
    args, unknown_args = arg_parser.parse_known_args()
    extra_args = parse_cmdline_kwargs(unknown_args)

    # configure logger, disable logging in child MPI processes (with rank > 0)
    if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
        rank = 0
        logger.configure()
    else:
        logger.configure(format_strs=[])
        rank = MPI.COMM_WORLD.Get_rank()

    model, env = train(args, extra_args)
    env.close()

    if args.save_path is not None and rank == 0:
        save_path = osp.expanduser(args.save_path)
        model.save(save_path)

    if args.play:
        logger.log("Running trained model")
        env = build_env(args, extra_args)
        obs = env.reset()
        turtle = atari_wrappers.get_turtle(env)

        scores = []
        session_scores = set()
        num_games = 0
        while num_games < 100:
            actions = model.step(obs)[0]
            # Read the life count before stepping: the episode only counts
            # as over when `done` arrives while on the last life.
            num_lives = turtle.ale.lives()
            obs, _, done, info = env.step(actions)
            done = num_lives == 1 and done

            if isinstance(info, (list, tuple)):
                session_scores.add(np.average([d['score'] for d in info]))
            elif isinstance(info, dict):
                # Previously `session_scores.add(['score'])`, which raises
                # TypeError (a list is unhashable) instead of recording a
                # score.
                session_scores.add(info['score'])
            else:
                session_scores.add(-1)

            if done:
                num_games += 1
                score = max(session_scores)
                scores.append(score)
                print("game %s: %s" % (num_games, score))
                obs = env.reset()
                session_scores = set()

        print("Avg score: %f" % np.average(scores))
        print("Median score: %f" % np.median(scores))
        print("Std error score: %f" % sem(scores))
        print("Std dev score: %f" % stdev(scores))
        env.close()
def single_intervention_move_ball(model,
                                  env,
                                  rollout_history,
                                  max_ep_len=3e3,
                                  move_distance=1,
                                  intervene_step=20):
    """Replay a rollout, shift the ball once, then let the model play on.

    The game is reset to the first recorded state and the recorded actions
    are replayed until ``intervene_step``. The ball position is then moved
    by ``move_distance`` in both x and y, three no-op steps are taken
    (not recorded), and the model acts until the episode is done or
    ``max_ep_len`` is exceeded.

    Args:
        model: Policy whose ``step(obs)`` yields actions, value and logits.
        env: Environment wrapping a Toybox game.
        rollout_history: Recorded rollout with ``'state_json'``,
            ``'actions'``, ``'a_logits'`` and ``'values'`` sequences.
        max_ep_len: Upper bound on the step counter for the model phase.
        move_distance: Offset added to the ball's x and y.
        intervene_step: Step counter value at which replay stops.

    Returns:
        A dict of equally long lists keyed by ``'ins'``, ``'a_logits'``,
        ``'values'``, ``'actions'``, ``'rewards'``, ``'color_frame'`` and
        ``'state_json'``.
    """
    columns = ('ins', 'a_logits', 'values', 'actions', 'rewards',
               'color_frame', 'state_json')
    history = {name: [] for name in columns}

    def record(*row):
        # One value per column, in the order given by `columns`.
        for name, item in zip(columns, row):
            history[name].append(item)

    step_count = 0
    total_reward = 0
    done = False

    print("Running trained model")
    obs = env.reset()
    turtle = atari_wrappers.get_turtle(env)
    tb = turtle.toybox

    # Start a new game from the same state the original rollout started in.
    tb.new_game()
    tb.write_state_json(rollout_history['state_json'][0])

    # The start state is recorded without logits, value or action.
    first_state = tb.state_to_json()
    first_frame = tb.get_rgb_frame()
    record(obs, None, None, None, total_reward, first_frame, first_state)
    step_count += 1

    # Phase 1: replay the recorded actions up to the intervention step.
    while step_count < intervene_step:
        replayed = rollout_history['actions'][step_count]
        obs, reward, done, info = env.step(replayed)
        total_reward += reward[0]
        frame = tb.get_rgb_frame()
        state = tb.state_to_json()
        record(obs, rollout_history['a_logits'][step_count],
               rollout_history['values'][step_count], replayed, total_reward,
               frame, state)
        step_count += 1

    # Phase 2: move the ball.
    print("Intervening on ball now and forward simulating")
    with BreakoutIntervention(tb) as intervention:
        ball_pos = intervention.get_ball_position()
        print("old: ", ball_pos)
        ball_pos['x'] += move_distance
        ball_pos['y'] += move_distance
        print("new: ", ball_pos)
        intervention.set_ball_position(ball_pos)
        moved_pos = intervention.get_ball_position()
        assert moved_pos['x'] == ball_pos['x']

    # Three unrecorded no-op steps after the intervention.
    for _ in range(3):
        obs, _, _, _ = env.step(0)

    # Phase 3: the model plays until the episode ends or the cap is hit.
    while not done and step_count <= max_ep_len:
        step_count += 1
        actions, value, _, _, a_logits, _ = model.step(obs)
        turtle.ale.lives()  # result unused; call kept from the original
        obs, reward, done, info = env.step(actions)
        total_reward += reward[0]
        frame = tb.get_rgb_frame()
        state = tb.state_to_json()
        record(obs, a_logits, value, actions[0], total_reward, frame, state)
        print('\tstep # {}, reward {:.0f}'.format(step_count, total_reward),
              end='\r')

    # Every column must have the same number of entries.
    expected_len = len(history['ins'])
    for column in history.values():
        assert len(column) == expected_len
    return history
def single_intervention_move_enemy_back(model, env, rollout_history, enemy_id,
                                        intervening_frame):
    """Replay a rollout, push one enemy backwards, then let the model play.

    The game is reset to the first recorded state and the recorded actions
    are replayed until ``intervening_frame``. The chosen enemy's
    ``['ai']['EnemyLookupAI']['next']`` value is then reduced by a random
    amount in ``[6, 10]`` (clamped at 0), the state is written back, three
    unrecorded no-op steps are taken and the model acts until ``done``.

    Args:
        model: Policy whose ``step(obs)`` yields actions, value and logits.
        env: Environment wrapping a Toybox game.
        rollout_history: Recorded rollout with ``'state_json'``,
            ``'actions'``, ``'a_logits'`` and ``'values'`` sequences.
        enemy_id: Index into ``state_json['enemies']``.
        intervening_frame: Step counter value at which replay stops.

    Returns:
        A dict of equally long lists keyed by ``'ins'``, ``'a_logits'``,
        ``'values'``, ``'actions'``, ``'rewards'``, ``'color_frame'`` and
        ``'state_json'``.
    """
    history = {
        'ins': [],
        'a_logits': [],
        'values': [],
        'actions': [],
        'rewards': [],
        'color_frame': [],
        'state_json': []
    }
    episode_length, epr, done = 0, 0, False

    # pick random distance to move (randint is inclusive on both ends)
    move_step = random.randint(6, 10)

    print("Running trained model")
    obs = env.reset()
    turtle = atari_wrappers.get_turtle(env)
    tb = turtle.toybox

    # start new game and set start state to the same state as original game
    tb.new_game()
    tb.write_state_json(rollout_history['state_json'][0])

    # add start state to history
    state_json = tb.state_to_json()
    color_frame = tb.get_rgb_frame()
    history['ins'].append(obs)
    history['a_logits'].append(None)
    history['values'].append(None)
    history['actions'].append(None)
    history['rewards'].append(epr)
    history['color_frame'].append(color_frame)
    history['state_json'].append(state_json)
    episode_length += 1

    # Run the episode as recorded until the intervention step. The counter
    # is advanced once per iteration; it used to be incremented at both the
    # top and the bottom of the loop, which skipped every other recorded
    # action and replayed only about half of the intended steps.
    while episode_length < intervening_frame:
        action = rollout_history['actions'][episode_length]
        obs, reward, done, _ = env.step(action)
        epr += reward[0]
        color_frame = tb.get_rgb_frame()
        state_json = tb.state_to_json()

        # save info
        history['ins'].append(obs)
        history['a_logits'].append(rollout_history['a_logits'][episode_length])
        history['values'].append(rollout_history['values'][episode_length])
        history['actions'].append(action)
        history['rewards'].append(epr)
        history['color_frame'].append(color_frame)
        history['state_json'].append(state_json)
        episode_length += 1

    # intervene by moving enemy behind
    print("Intervening on enemy position now -- moving {} steps behind".format(
        move_step))
    # NOTE: `state_json` is the same object as the last entry of
    # history['state_json'], so the edit below is visible there as well.
    lookup_ai = state_json['enemies'][enemy_id]['ai']['EnemyLookupAI']
    print("old next step: ", lookup_ai['next'])
    lookup_ai['next'] = max(lookup_ai['next'] - move_step, 0)
    print("new next step: ", lookup_ai['next'])
    tb.write_state_json(state_json)

    # forward simulate 3 steps with no-op action (not recorded)
    for _ in range(3):
        obs, _, _, _ = env.step(0)

    # forward simulate for remainder of episode
    while not done:
        episode_length += 1
        actions, value, _, _, a_logits, _ = model.step(obs)
        obs, reward, done, _ = env.step(actions)
        epr += reward[0]
        color_frame = tb.get_rgb_frame()
        state_json = tb.state_to_json()

        # save info
        history['ins'].append(obs)
        history['a_logits'].append(a_logits)
        history['values'].append(value)
        history['actions'].append(actions[0])
        history['rewards'].append(epr)
        history['color_frame'].append(color_frame)
        history['state_json'].append(state_json)
        print('\tstep # {}, reward {:.0f}'.format(episode_length, epr),
              end='\r')

    # check whether output has same length for different keys
    for key in history:
        assert len(history['ins']) == len(history[key])
    return history
def multiple_intervention_decrement_score(model,
                                          env,
                                          rollout_history,
                                          max_ep_len=3e3,
                                          max_score=3e3):
    """Let the model play while the score is decremented after every step.

    The game starts from the first recorded state with its score set to
    ``max_score``. After each model step the current state is read, its
    score is reduced by a fixed random amount in ``[1, 21]`` (clamped at 0)
    and written back before the frame is captured.

    Args:
        model: Policy whose ``step(obs)`` yields actions, value and logits.
        env: Environment wrapping a Toybox game.
        rollout_history: Recorded rollout; only ``'state_json'][0]`` is
            read, and it is no longer modified.
        max_ep_len: Upper bound on the step counter.
        max_score: Score written into the start state (cast to ``int``).

    Returns:
        A dict of equally long lists keyed by ``'ins'``, ``'a_logits'``,
        ``'values'``, ``'actions'``, ``'rewards'``, ``'color_frame'`` and
        ``'state_json'``.
    """
    history = {
        'ins': [],
        'a_logits': [],
        'values': [],
        'actions': [],
        'rewards': [],
        'color_frame': [],
        'state_json': []
    }
    episode_length, epr, done = 0, 0, False

    # pick random decrement size (randint is inclusive on both ends)
    decrement_size = random.randint(1, 21)
    print("decrement_size: ", decrement_size)

    print("Running trained model")
    obs = env.reset()
    turtle = atari_wrappers.get_turtle(env)
    tb = turtle.toybox

    # Start a new game from the original start state with max_score. Only
    # the top-level 'score' key changes, so a shallow copy is enough to keep
    # the caller's rollout_history untouched (it used to be edited in place).
    tb.new_game()
    state_json = dict(rollout_history['state_json'][0])
    state_json['score'] = int(max_score)
    tb.write_state_json(state_json)

    # add start state to history
    state_json = tb.state_to_json()
    color_frame = tb.get_rgb_frame()
    history['ins'].append(obs)
    history['a_logits'].append(None)
    history['values'].append(None)
    history['actions'].append(None)
    history['rewards'].append(epr)
    history['color_frame'].append(color_frame)
    history['state_json'].append(state_json)
    episode_length += 1

    while not done and episode_length <= max_ep_len:
        episode_length += 1
        actions, value, _, _, a_logits, _ = model.step(obs)
        obs, reward, done, _ = env.step(actions)
        epr += reward[0]

        # intervene on score: decrement, never going below zero
        state_json = tb.state_to_json()
        state_json['score'] = max(state_json['score'] - decrement_size, 0)
        tb.write_state_json(state_json)
        # The frame is captured after the score has been written back.
        color_frame = tb.get_rgb_frame()

        # save info
        history['ins'].append(obs)
        history['a_logits'].append(a_logits)
        history['values'].append(value)
        history['actions'].append(actions[0])
        history['rewards'].append(epr)
        history['color_frame'].append(color_frame)
        history['state_json'].append(state_json)
        print('\tstep # {}, reward {:.0f}'.format(episode_length, epr),
              end='\r')

    # check whether output has same length for different keys
    for key in history:
        assert len(history['ins']) == len(history[key])
    return history
def multiple_intervention_modify_score(
        model,
        env,
        rollout_history,
        max_ep_len=3e3,
        abs_score=0,
        intervene_steps=[20, 40, 80, 100, 120, 140],
        random_score=False):
    """Replay a rollout, then modify the score at regular intervals.

    A step delta is drawn with ``random.randint(5, 21)`` and the
    intervention steps become ``range(delta, int(max_ep_len), delta)``.
    Recorded actions are replayed until the first intervention step; after
    that the model plays, and whenever the step counter is an intervention
    step ``amidar_modify_score`` is called and its return value becomes the
    observation fed to the model.

    Args:
        model: Policy whose ``step(obs)`` yields actions, value and logits.
        env: Environment wrapping a Toybox game.
        rollout_history: Recorded rollout with ``'ins'``, ``'state_json'``,
            ``'actions'``, ``'a_logits'`` and ``'values'`` sequences.
        max_ep_len: Upper bound on the step counter and on the generated
            intervention steps.
        abs_score: Forwarded to ``amidar_modify_score``.
        intervene_steps: NOTE: the passed value is overwritten by the
            randomly generated range before it is ever read.
        random_score: Forwarded to ``amidar_modify_score``.

    Returns:
        A dict of equally long lists keyed by ``'ins'``, ``'a_logits'``,
        ``'values'``, ``'actions'``, ``'rewards'``, ``'color_frame'`` and
        ``'state_json'``.
    """
    columns = ('ins', 'a_logits', 'values', 'actions', 'rewards',
               'color_frame', 'state_json')
    history = {name: [] for name in columns}

    def record(*row):
        # One value per column, in the order given by `columns`.
        for name, item in zip(columns, row):
            history[name].append(item)

    step_count = 0
    total_reward = 0
    done = False

    # Evenly spaced intervention steps with a randomly drawn spacing.
    step_delta = random.randint(5, 21)
    intervene_steps = range(step_delta, int(max_ep_len), step_delta)
    print("intervene steps: ", intervene_steps)
    print("rollout_history len: ", len(rollout_history['ins']))

    print("Running trained model")
    obs = env.reset()
    turtle = atari_wrappers.get_turtle(env)
    tb = turtle.toybox

    # Start a new game from the same state the original rollout started in.
    tb.new_game()
    tb.write_state_json(rollout_history['state_json'][0])

    # The start state is recorded without logits, value or action.
    first_state = tb.state_to_json()
    first_frame = tb.get_rgb_frame()
    record(obs, None, None, None, total_reward, first_frame, first_state)
    step_count += 1

    # Replay the recorded actions until the first intervention step.
    while step_count < intervene_steps[0]:
        replayed = rollout_history['actions'][step_count]
        obs, reward, done, info = env.step(replayed)
        total_reward += reward[0]
        frame = tb.get_rgb_frame()
        state = tb.state_to_json()
        record(obs, rollout_history['a_logits'][step_count],
               rollout_history['values'][step_count], replayed, total_reward,
               frame, state)
        step_count += 1

    # Model-driven phase with an intervention at every scheduled step.
    while not done and step_count <= max_ep_len:
        step_count += 1
        if step_count in intervene_steps:
            obs = amidar_modify_score(tb, rollout_history, step_count,
                                      abs_score, env, random_score)

        actions, value, _, _, a_logits, _ = model.step(obs)
        turtle.ale.lives()  # result unused; call kept from the original
        obs, reward, done, info = env.step(actions)
        total_reward += reward[0]
        frame = tb.get_rgb_frame()
        state = tb.state_to_json()
        record(obs, a_logits, value, actions[0], total_reward, frame, state)
        print('\tstep # {}, reward {:.0f}'.format(step_count, total_reward),
              end='\r')

    # Every column must have the same number of entries.
    expected_len = len(history['ins'])
    for column in history.values():
        assert len(column) == expected_len
    return history
def single_intervention_modify_score(model,
                                     env,
                                     rollout_history,
                                     max_ep_len=3e3,
                                     abs_score=0,
                                     intervene_step=20):
    """Replay a rollout, modify the score once, then let the model play on.

    The game is reset to the first recorded state and the recorded actions
    are replayed until ``intervene_step``. ``amidar_modify_score`` is then
    called once and its return value becomes the observation fed to the
    model, which acts until the episode is done or ``max_ep_len`` is
    exceeded.

    Args:
        model: Policy whose ``step(obs)`` yields actions, value and logits.
        env: Environment wrapping a Toybox game.
        rollout_history: Recorded rollout with ``'state_json'``,
            ``'actions'``, ``'a_logits'`` and ``'values'`` sequences.
        max_ep_len: Upper bound on the step counter for the model phase.
        abs_score: Forwarded to ``amidar_modify_score``.
        intervene_step: Step counter value at which replay stops.

    Returns:
        A dict of equally long lists keyed by ``'ins'``, ``'a_logits'``,
        ``'values'``, ``'actions'``, ``'rewards'``, ``'color_frame'`` and
        ``'state_json'``.
    """
    columns = ('ins', 'a_logits', 'values', 'actions', 'rewards',
               'color_frame', 'state_json')
    history = {name: [] for name in columns}

    def record(*row):
        # One value per column, in the order given by `columns`.
        for name, item in zip(columns, row):
            history[name].append(item)

    step_count = 0
    total_reward = 0
    done = False

    print("Running trained model")
    obs = env.reset()
    turtle = atari_wrappers.get_turtle(env)
    tb = turtle.toybox

    # Start a new game from the same state the original rollout started in.
    tb.new_game()
    tb.write_state_json(rollout_history['state_json'][0])

    # The start state is recorded without logits, value or action.
    first_state = tb.state_to_json()
    first_frame = tb.get_rgb_frame()
    record(obs, None, None, None, total_reward, first_frame, first_state)
    step_count += 1

    # Replay the recorded actions up to the intervention step.
    while step_count < intervene_step:
        replayed = rollout_history['actions'][step_count]
        obs, reward, done, info = env.step(replayed)
        total_reward += reward[0]
        frame = tb.get_rgb_frame()
        state = tb.state_to_json()
        record(obs, rollout_history['a_logits'][step_count],
               rollout_history['values'][step_count], replayed, total_reward,
               frame, state)
        step_count += 1

    # Single intervention; its result is the model's next observation.
    obs = amidar_modify_score(tb, rollout_history, step_count, abs_score,
                              env)

    # The model plays until the episode ends or the cap is hit.
    while not done and step_count <= max_ep_len:
        step_count += 1
        actions, value, _, _, a_logits, _ = model.step(obs)
        turtle.ale.lives()  # result unused; call kept from the original
        obs, reward, done, info = env.step(actions)
        total_reward += reward[0]
        frame = tb.get_rgb_frame()
        state = tb.state_to_json()
        record(obs, a_logits, value, actions[0], total_reward, frame, state)
        print('\tstep # {}, reward {:.0f}'.format(step_count, total_reward),
              end='\r')

    # Every column must have the same number of entries.
    expected_len = len(history['ins'])
    for column in history.values():
        assert len(column) == expected_len
    return history
def single_intervention_shift_bricks(model,
                                     env,
                                     rollout_history,
                                     max_ep_len=3e3,
                                     intervene_step=20,
                                     shift_dist=1,
                                     move_ball=False,
                                     move_paddle=False):
    """Replay a rollout, shift the dead bricks sideways, then play on.

    The game is reset to the first recorded state and the recorded actions
    are replayed until ``intervene_step``. Every brick that is not alive is
    reset with ``set_brick(i)`` and the brick ``shift_dist`` brick-widths to
    its right in the same row is marked dead instead. Optionally the ball
    and/or paddle are moved 70 units. Three unrecorded no-op steps follow,
    then the model acts until the episode is done or ``max_ep_len`` is
    exceeded.

    Args:
        model: Policy whose ``step(obs)`` yields actions, value and logits.
        env: Environment wrapping a Toybox game.
        rollout_history: Recorded rollout with ``'state_json'``,
            ``'actions'``, ``'a_logits'`` and ``'values'`` sequences.
        max_ep_len: Upper bound on the step counter for the model phase.
        intervene_step: Step counter value at which replay stops.
        shift_dist: Shift distance in multiples of the first brick's width.
        move_ball: Also move the ball by -70 in x and y.
        move_paddle: Also move the paddle by -70 in x.

    Returns:
        A dict of equally long lists keyed by ``'ins'``, ``'a_logits'``,
        ``'values'``, ``'actions'``, ``'rewards'``, ``'color_frame'`` and
        ``'state_json'``.
    """
    history = {
        'ins': [],
        'a_logits': [],
        'values': [],
        'actions': [],
        'rewards': [],
        'color_frame': [],
        'state_json': []
    }
    episode_length, epr, done = 0, 0, False

    print("Running trained model")
    obs = env.reset()
    turtle = atari_wrappers.get_turtle(env)
    tb = turtle.toybox

    # start new game and set start state to the same state as original game
    tb.new_game()
    tb.write_state_json(rollout_history['state_json'][0])

    # add start state to history
    state_json = tb.state_to_json()
    color_frame = tb.get_rgb_frame()
    history['ins'].append(obs)
    history['a_logits'].append(None)
    history['values'].append(None)
    history['actions'].append(None)
    history['rewards'].append(epr)
    history['color_frame'].append(color_frame)
    history['state_json'].append(state_json)
    episode_length += 1

    # replay the recorded actions up to the intervention step
    while episode_length < intervene_step:
        action = rollout_history['actions'][episode_length]
        obs, reward, done, _ = env.step(action)
        epr += reward[0]
        color_frame = tb.get_rgb_frame()
        state_json = tb.state_to_json()

        # save info
        history['ins'].append(obs)
        history['a_logits'].append(rollout_history['a_logits'][episode_length])
        history['values'].append(rollout_history['values'][episode_length])
        history['actions'].append(action)
        history['rewards'].append(epr)
        history['color_frame'].append(color_frame)
        history['state_json'].append(state_json)
        episode_length += 1

    print(
        "Intervening on shifting bricks now and forward simulating with shift distance of {}"
        .format(shift_dist))

    # Shared by the ball and paddle moves. It used to be assigned only
    # inside the `move_ball` branch, so move_paddle=True with
    # move_ball=False raised UnboundLocalError.
    move_distance = 70

    with BreakoutIntervention(tb) as intervention:
        bricks = intervention.get_bricks()
        brick_width = int(bricks[0]['size']['x'])
        bricks_to_flip = []

        # flip only dead bricks in brick + shift_dist in x position
        for i, brick in enumerate(bricks):
            if brick['alive'] is False:
                # presumably revives the brick (set_brick default) -- confirm
                intervention.set_brick(i)
                shift_xPos = brick['position']['x'] + shift_dist * brick_width
                if shift_xPos > 216:
                    # NOTE(review): with true division this is just
                    # `shift_xPos - 216` as a float; if snapping to a
                    # multiple of 12 was intended, `//` is needed. Left
                    # unchanged because the intent is not visible here.
                    shift_xPos = 12 * ((shift_xPos - 216) / 12)
                # find the brick at the shifted x position in the same row
                for j, brick2 in enumerate(bricks):
                    if (brick2['position']['x'] == shift_xPos and
                            brick2['position']['y'] == brick['position']['y']):
                        bricks_to_flip.append(j)
                        break

        # Applied after the scan so the loop above sees the original state.
        for brick_index in bricks_to_flip:
            intervention.set_brick(brick_index, alive=False)

        # move ball to left side of board
        if move_ball:
            ball_pos = intervention.get_ball_position()
            print("old ball pos: ", ball_pos)
            ball_pos['x'] = ball_pos['x'] - move_distance
            ball_pos['y'] = ball_pos['y'] - move_distance
            print("new ball pos: ", ball_pos)
            intervention.set_ball_position(ball_pos)
            ball_pos_post = intervention.get_ball_position()
            assert ball_pos_post['x'] == ball_pos['x']

        # move paddle to left side of board
        if move_paddle:
            pos = intervention.get_paddle_position()
            print("old paddle pos: ", pos)
            pos['x'] = pos['x'] - move_distance
            print("new paddle pos: ", pos)
            intervention.set_paddle_position(pos)

    # forward simulate 3 steps with no-op action (not recorded)
    for _ in range(3):
        obs, _, _, _ = env.step(0)

    while not done and episode_length <= max_ep_len:
        episode_length += 1
        actions, value, _, _, a_logits, _ = model.step(obs)
        obs, reward, done, _ = env.step(actions)
        epr += reward[0]
        color_frame = tb.get_rgb_frame()
        state_json = tb.state_to_json()

        # save info
        history['ins'].append(obs)
        history['a_logits'].append(a_logits)
        history['values'].append(value)
        history['actions'].append(actions[0])
        history['rewards'].append(epr)
        history['color_frame'].append(color_frame)
        history['state_json'].append(state_json)
        print('\tstep # {}, reward {:.0f}'.format(episode_length, epr),
              end='\r')

    # check whether output has same length for different keys
    for key in history:
        assert len(history['ins']) == len(history[key])
    return history
def main():
    """Train a model and, with ``args.play``, hunt for Breakout seed states.

    Logging is configured normally on MPI rank 0 (or when MPI is
    unavailable) and silenced on other ranks. In play mode the trained model
    steps a fresh environment until every predicate in ``found_seed`` has
    been met; each time a predicate matches, the current Toybox state is
    passed to ``save_seed_json``. Predicates are only registered for
    ``BreakoutEnv``, so for other Toybox environments the loop body never
    runs.

    Raises:
        ValueError: In play mode, if the unwrapped environment is not a
            ``ToyboxBaseEnv``.
    """
    parser = common_arg_parser()
    args, unknown_args = parser.parse_known_args()
    extra_args = parse_cmdline_kwargs(unknown_args)

    # configure logger, disable logging in child MPI processes (with rank > 0)
    if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
        rank = 0
        logger.configure()
    else:
        logger.configure(format_strs=[])
        rank = MPI.COMM_WORLD.Get_rank()

    model, env = train(args, extra_args)
    env.close()

    if not args.play:
        return

    logger.log("Running trained model")
    env = build_env(args)
    obs = env.reset()
    turtle = atari_wrappers.get_turtle(env)

    if not isinstance(turtle, ToyboxBaseEnv):
        raise ValueError(
            "Not a ToyboxBaseEnv; cannot export state to JSON", turtle)

    # One flag per predicate; the search ends once all of them are True.
    found_seed = {}
    if isinstance(turtle, BreakoutEnv):
        found_seed['breakout_bricks_remaining'] = False
        found_seed['breakout_channel_count'] = False

    while not all(found_seed.values()):
        actions = model.step(obs)[0]
        lives_before = turtle.ale.lives()
        obs, _, done, info = env.step(actions)
        # Only treat the episode as over when it ends on the last life.
        done = lives_before == 1 and done

        if isinstance(turtle, AmidarEnv):
            pass  # no Amidar predicates are defined

        if isinstance(turtle, BreakoutEnv):
            # find single brick remaining seed
            if turtle.toybox.rstate.breakout_bricks_remaining() == 1:
                found_seed['breakout_bricks_remaining'] = True
                save_seed_json('breakout_bricks_remaining',
                               turtle.toybox.to_json(),
                               extra_args['load_path'])

            # Same truth table as the original
            # `count() and not found_seed[...] == 1`.
            if (turtle.toybox.rstate.breakout_channel_count()
                    and found_seed['breakout_channel_count'] != 1):
                found_seed['breakout_channel_count'] = True
                save_seed_json('breakout_channel_count',
                               turtle.toybox.to_json(),
                               extra_args['load_path'])

        if done:
            obs = env.reset()
            print("Game ended before predicate met. New game.")

    env.close()