def eval_db_agent(env, params):
    """Evaluate a trained VAE agent for params['num_episodes'] episodes."""
    if params['use_preproc']:
        preprocessor = Preprocessor(params['state_dim'], params['history'],
                                    params['use_luminance'], params['resize_shape'])
        params['state_dim'] = preprocessor.state_shape
    else:
        preprocessor = None

    agent = VAE(params['state_dim'], params['action_dim'])
    if params['use_cuda']:
        agent = agent.cuda()
        agent.load_state_dict(torch.load('./agents/{0}_{1}'.format(params['arch'], params['env_name'])))
    else:
        agent.load_state_dict(
            torch.load('./agents/{0}_{1}'.format(params['arch'], params['env_name']), map_location='cpu'))
    agent.eval()

    agent_steps = 0
    episode_rewards = []
    start = time.time()

    for episode in xrange(1, params['num_episodes'] + 1):
        env_state = env.reset()
        episode_reward = 0.0
        for t in xrange(1, params['max_steps'] + 1):
            if params['env_render']:
                env.render()
            if preprocessor:
                state = preprocessor.process_state(env_state)
            else:
                state = env_state
            var_state = createVariable(state, use_cuda=params['use_cuda'])
            action, state_val = agent.sample_action_eval(var_state)

            reward = 0.0
            for _ in range(1):  # single env step per sampled action
                env_state, r, terminal, _ = env.step(action)
                reward += r
                if terminal:
                    break
            episode_reward += reward
            if terminal:
                break

        episode_rewards.append(episode_reward)
        agent_steps += t
        if preprocessor:
            preprocessor.reset()

        # mean over the last (up to) 100 episodes
        print 'Episode {0} | Total Steps {1} | Total Reward {2} | Mean Reward {3} | Total Time {4}' \
            .format(episode, agent_steps, episode_reward,
                    sum(episode_rewards[-100:]) / len(episode_rewards[-100:]),
                    timeSince(start, float(episode) / params['num_episodes']))
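# The evaluation loop above relies on two helpers defined elsewhere in this repo:
# createVariable (wraps a numpy state as a torch tensor with a batch dimension) and
# timeSince (progress/elapsed-time string). The sketches below are assumptions about
# plausible implementations, not the repo's actual code, and are named *_sketch to
# make that explicit.
import math
import time

import numpy as np
import torch
from torch.autograd import Variable


def createVariable_sketch(state, use_cuda=False):
    # Assumed behaviour: add a batch dimension and move to the GPU when requested.
    tensor = torch.from_numpy(np.asarray(state, dtype=np.float32)).unsqueeze(0)
    var = Variable(tensor)
    return var.cuda() if use_cuda else var


def timeSince_sketch(since, percent):
    # Assumed behaviour: "elapsed (- estimated remaining)" string given progress in [0, 1].
    def as_minutes(seconds):
        minutes = math.floor(seconds / 60)
        return '%dm %ds' % (minutes, seconds - minutes * 60)

    elapsed = time.time() - since
    if percent <= 0:
        return '%s (- ?)' % as_minutes(elapsed)
    remaining = elapsed / percent - elapsed
    return '%s (- %s)' % (as_minutes(elapsed), as_minutes(remaining))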
def cache_eval_episode(env, params):
    """Run one evaluation episode and cache (state, action-distribution) pairs to disk."""
    cache_states, cache_distros = [], []

    if params['use_preproc']:
        preprocessor = Preprocessor(params['state_dim'], params['history'],
                                    params['use_luminance'], params['resize_shape'])
        params['state_dim'] = preprocessor.state_shape
    else:
        preprocessor = None

    agent = agent_lookup(params)
    if params['use_cuda']:
        agent = agent.cuda()
        agent.load_state_dict(
            torch.load('./agents/{0}_{1}'.format(params['arch'], params['env_name'])))
    else:
        agent.load_state_dict(
            torch.load('./agents/{0}_{1}'.format(params['arch'], params['env_name']), map_location='cpu'))

    agent_steps = 0
    episode_rewards = []
    start = time.time()

    for episode in xrange(1):  # a single episode is enough to build the cache
        env_state = env.reset()
        episode_reward = 0.0
        for t in xrange(1, params['max_steps'] + 1):
            if params['env_render']:
                env.render()
            if preprocessor:
                state = preprocessor.process_state(env_state)
            else:
                state = env_state
            var_state = createVariable(state, use_cuda=params['use_cuda'])
            action, state_val, distro = agent.sample_action_distro(var_state)
            cache_states.append(state)
            cache_distros.append(distro.cpu().numpy())

            reward = 0.0
            for _ in range(1):  # single env step per sampled action
                env_state, r, terminal, _ = env.step(action)
                reward += r
                if terminal:
                    break
            episode_reward += reward
            if terminal:
                break

        episode_rewards.append(episode_reward)
        agent_steps += t
        if preprocessor:
            preprocessor.reset()

        if episode % params['print_every'] == 0:
            print 'Episode {0} | Total Steps {1} | Total Reward {2} | Mean Reward {3}' \
                .format(episode, agent_steps, episode_reward,
                        sum(episode_rewards[-100:]) / len(episode_rewards[-100:]))

    cache_states, cache_distros = np.array(cache_states), np.array(cache_distros)
    pickle.dump((cache_states, cache_distros),
                open('./out/{0}_{1}_episode.pkl'.format(params['arch'], params['env_name']), 'wb'), -1)
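# The pickle written by cache_eval_episode can be reloaded for offline analysis.
# A small, hedged usage sketch; load_cached_episode is a helper introduced here
# for illustration only, and simply mirrors the dump path used above.
import pickle

import numpy as np


def load_cached_episode(arch, env_name):
    # Returns the (states, distros) arrays saved by cache_eval_episode.
    path = './out/{0}_{1}_episode.pkl'.format(arch, env_name)
    with open(path, 'rb') as f:
        cache_states, cache_distros = pickle.load(f)
    return np.asarray(cache_states), np.asarray(cache_distros)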
def cache_abstraction(env, params):
    """Run one evaluation episode and dump each frame into a folder named by its discrete code."""
    if os.path.exists('./out/{0}'.format(params['env_name'])):
        shutil.rmtree('./out/{0}'.format(params['env_name']))

    if params['use_preproc']:
        preprocessor = Preprocessor(params['state_dim'], params['history'],
                                    params['use_luminance'], params['resize_shape'])
        params['state_dim'] = preprocessor.state_shape
    else:
        preprocessor = None

    agent = VAE(params['state_dim'], params['action_dim'])
    if params['use_cuda']:
        agent = agent.cuda()
        agent.load_state_dict(torch.load('./agents/{0}_{1}'.format(params['arch'], params['env_name'])))
    else:
        agent.load_state_dict(
            torch.load('./agents/{0}_{1}'.format(params['arch'], params['env_name']), map_location='cpu'))
    agent.eval()

    agent_steps = 0
    episode_rewards = []
    start = time.time()

    for episode in xrange(1):  # a single episode is enough to visualise the abstraction
        env_state = env.reset()
        episode_reward = 0.0
        for t in xrange(1, params['max_steps'] + 1):
            if params['env_render']:
                env.render()
            if preprocessor:
                state = preprocessor.process_state(env_state)
            else:
                state = env_state
            var_state = createVariable(state, use_cuda=params['use_cuda'])
            # sample_action_eval_code also returns the discrete code assigned to this state
            action, state_val, code = agent.sample_action_eval_code(var_state)

            if not os.path.exists('./out/{0}/{1}'.format(params['env_name'], code)):
                os.makedirs('./out/{0}/{1}'.format(params['env_name'], code))
            preprocessor.get_img_state().save('./out/{0}/{1}/{2}.png'.format(params['env_name'], code, t))

            reward = 0.0
            for _ in range(1):  # single env step per sampled action
                env_state, r, terminal, _ = env.step(action)
                reward += r
                if terminal:
                    break
            episode_reward += reward
            if terminal:
                break

        episode_rewards.append(episode_reward)
        agent_steps += t
        if preprocessor:
            preprocessor.reset()

        print 'Episode {0} | Total Steps {1} | Total Reward {2} | Mean Reward {3}' \
            .format(episode, agent_steps, episode_reward,
                    sum(episode_rewards[-100:]) / len(episode_rewards[-100:]))
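# cache_abstraction groups rendered frames by the discrete code the agent assigns to
# each state, under ./out/<env_name>/<code>/<t>.png. A hedged helper, introduced here
# for illustration only, that summarises how many frames landed in each abstract state:
import os


def summarize_abstraction(env_name, out_dir='./out'):
    # Returns {code: number_of_frames} built from the directory tree dumped above.
    root = os.path.join(out_dir, env_name)
    counts = {}
    for code in sorted(os.listdir(root)):
        code_dir = os.path.join(root, code)
        if os.path.isdir(code_dir):
            counts[code] = len(os.listdir(code_dir))
    return counts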
def train_agent(env, params):
    """Train an actor-critic agent with n-step updates every params['update_freq'] steps."""
    if params['use_preproc']:
        preprocessor = Preprocessor(params['state_dim'], params['history'],
                                    params['use_luminance'], params['resize_shape'])
        params['state_dim'] = preprocessor.state_shape
    else:
        preprocessor = None

    agent = agent_lookup(params)
    agent.train()

    if params['optim'] == 'rms':
        optimizer = torch.optim.RMSprop(agent.parameters(), lr=params['learning_rate'])
    elif params['optim'] == 'adam':
        optimizer = torch.optim.Adam(agent.parameters(), lr=params['learning_rate'])
    else:
        print 'Unknown optimizer specified!'
        sys.exit(0)

    if params['use_cuda']:
        agent = agent.cuda()

    agent_steps = 0
    episode_rewards = []
    start = time.time()

    for episode in xrange(1, params['num_episodes'] + 1):
        env_state = env.reset()
        episode_reward = 0.0
        policy_loss, value_loss = 0.0, 0.0
        num_updates = 0

        for t in xrange(1, params['max_steps'] + 1):
            if params['env_render']:
                env.render()
            if preprocessor:
                state = preprocessor.process_state(env_state)
            else:
                state = env_state
            var_state = createVariable(state, use_cuda=params['use_cuda'])
            action, state_val = agent.sample_action(var_state)

            reward = 0.0
            for _ in range(1):  # single env step per sampled action
                env_state, r, terminal, _ = env.step(action)
                reward += r
                if terminal:
                    break
            agent.rewards.append(reward)
            episode_reward += reward

            if terminal:
                # terminal states bootstrap from a value of zero
                agent.final_state_val = 0.0
                break

            if t % params['update_freq'] == 0:
                # bootstrap the n-step return from the current state-value estimate
                agent.final_state_val = state_val[0]
                pl, vl = train_step(agent, optimizer, params)
                policy_loss += pl
                value_loss += vl
                num_updates += 1

        episode_rewards.append(episode_reward)

        # flush whatever transitions remain at the end of the episode
        agent.final_state_val = 0.0
        pl, vl = train_step(agent, optimizer, params)
        policy_loss += pl
        value_loss += vl
        num_updates += 1

        agent_steps += t
        if preprocessor:
            preprocessor.reset()

        if params['arch'] in ['VQ-A2C']:
            # track which discrete codes were visited during the episode
            visit = len(agent.visited), agent.visited
            agent.visited = set([])
        else:
            visit = 0

        if episode % params['print_every'] == 0:
            print 'Episode {0} | Total Steps {1} | Total Reward {2} | Mean Reward {3} | Policy Loss {4} | Value Loss {5} | Total Time {6} | S_A {7}' \
                .format(episode, agent_steps, episode_reward,
                        sum(episode_rewards[-100:]) / len(episode_rewards[-100:]),
                        policy_loss / num_updates, value_loss / num_updates,
                        timeSince(start, float(episode) / params['num_episodes']),
                        visit)
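# A hedged usage sketch for train_agent. The key set mirrors what the functions above
# actually read from params; every concrete value below is a placeholder rather than a
# recommended setting, and agent_lookup / Preprocessor / train_step are assumed to be
# defined elsewhere in this repo.
if __name__ == '__main__':
    import gym
    import torch

    example_params = {
        'env_name': 'CartPole-v0',      # placeholder environment
        'arch': 'A2C',                  # must match an entry handled by agent_lookup
        'use_preproc': False,           # skip pixel preprocessing for low-dimensional states
        'state_dim': 4,
        'action_dim': 2,
        'history': 4,
        'use_luminance': False,
        'resize_shape': (84, 84),
        'use_cuda': torch.cuda.is_available(),
        'env_render': False,
        'optim': 'adam',
        'learning_rate': 1e-3,
        'update_freq': 20,
        'num_episodes': 1000,
        'max_steps': 500,
        'print_every': 10,
    }
    example_env = gym.make(example_params['env_name'])
    train_agent(example_env, example_params)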