Esempio n. 1
0
        'cpu', torch.uint8).numpy()
    im = Image.fromarray(ndarr)
    return im


# Script-level configuration constants.
episodes = 2
gru_size = 32
bhx_size = 64
ox_size = 100
input_c_features = 8 * 5 * 5
eps = (0, 0)
# hx_ae_model = HxQBNet(gru_size, bhx_size)
ox_ae_best_path = "./resources/pongD_deconv_obs_model_v1.p"

# Build the environment and take one initial observation; its length is the
# first constructor argument of the observation auto-encoder below.
env_name = "PongDeterministic-v4"
env = atari_wrapper(env_name)
obs = env.reset()

ox_ae_model = ConvObsQBNet(len(obs), ox_size)

# initialize visualization app
# The pickle holds a 4-tuple; only the third element (observation data) is
# used here. The file is opened in a ``with`` block so the handle is closed
# deterministically instead of being left to the garbage collector.
# NOTE: unpickling executes arbitrary code -- only load this file from a
# trusted source.
with open("./resources/pongD_bottleneck_data.p", "rb") as data_file:
    _, _, obs_data, _ = pickle.load(data_file)

vis_board = visboard()
vis_board.add_ae(ox_ae_model,
                 obs_data,
                 latent_options={
                     'n': ox_ae_model.latent_size,
                     'min': -1,
                     'max': 1,
    if not os.path.exists('results/'):
        os.mkdir('results/')
    if not os.path.exists('results/' + args.model_type):
        os.mkdir('results/' + args.model_type)

    # start to create models...
    if args.model_type == 'inception':
        model = models.inception_v3(pretrained=True)
    elif args.model_type == 'resnet152':
        model = models.resnet152(pretrained=True)
    elif args.model_type == 'resnet18':
        model = models.resnet18(pretrained=True)
    elif args.model_type == 'vgg19':
        model = models.vgg19_bn(pretrained=True)
    elif args.model_type == 'atari':
        env = atari_wrapper("PongDeterministic-v4")
        env.seed(1)
        obs = env.reset()
        gru_net = GRUNetConv(len(obs), 32, int(env.action_space.n))
        ox_net = ObsQBNet(gru_net.input_c_features, 100)
        model = MMNet(gru_net, None, ox_net)
        model_path = "./pongD_bgru_model.p"
        pretrained_ox_dict = {
            k[8:]: v
            for k, v in torch.load(model_path, map_location='cpu').items()
            if k.startswith("obx_net")
        }
        model.obx_net.load_state_dict(pretrained_ox_dict)
        pretrained_conv_dict = {
            k[8:]: v
            for k, v in torch.load(model_path, map_location='cpu').items()
Esempio n. 3
0
def gather_observations(env_name,
                        gru_size,
                        bhx_size,
                        ox_size,
                        bgru_net_path,
                        device,
                        episodes=1,
                        env_type='atari'):
    """Roll out a trained network greedily and collect its input observations.

    If ``./inputs/<env_name>/observations.pt`` already exists it is loaded
    and returned as-is; no environment or network is created in that case.
    Otherwise the network is built for ``env_type``, its weights are loaded
    from ``bgru_net_path``, and ``episodes`` episodes are played by always
    taking the highest-probability action. Every observation fed to the
    network is recorded, and the resulting list is saved to the cache path
    before being returned.

    Args:
        env_name: Environment id; also used as the cache sub-directory name.
        gru_size: Hidden size passed to the GRU network constructor.
        bhx_size: Size passed to the hidden-state bottleneck constructor.
        ox_size: Size passed to the observation bottleneck constructor.
        bgru_net_path: Path of the saved state dict for the combined network.
        device: ``map_location`` used when loading the cached observations.
            GPU placement of the network itself follows the ``cuda`` flag,
            which is defined outside this function.
        episodes: Number of episodes to roll out.
        env_type: Either ``'atari'`` or ``'classic_control'``.

    Returns:
        A list with one tensor per environment step; each tensor is the raw
        observation converted with ``torch.Tensor`` and given a leading
        dimension of size 1. When the cache file exists, whatever object it
        contains is returned instead.

    Raises:
        ValueError: If ``env_type`` is not one of the supported values.
    """
    cache_dir = './inputs/' + str(env_name)
    cache_path = cache_dir + '/observations.pt'
    if os.path.exists(cache_path):
        return torch.load(cache_path, map_location=device)

    if env_type == 'atari':
        env = atari_wrapper(env_name)
        env.seed(0)
        obs = env.reset()
        gru_net = GRUNet(len(obs), gru_size, int(env.action_space.n))
        bhx_net = HxQBNet(gru_size, bhx_size)
        ox_net = ObsQBNet(gru_net.input_c_features, ox_size)
        bgru_net = MMNet(gru_net, bhx_net, ox_net)
    elif env_type == 'classic_control':
        env = gym.make(env_name)
        env.seed(0)
        obs = env.reset()
        gru_net = ControlGRUNet(len(obs), gru_size, int(env.action_space.n))
        bhx_net = ControlHxQBNet(gru_size, bhx_size)
        ox_net = ControlObsQBNet(gru_net.input_c_features, ox_size)
        bgru_net = ControlMMNet(gru_net, bhx_net, ox_net)
    else:
        # Fail with a clear message instead of a NameError further down.
        raise ValueError('unsupported env_type: {!r}'.format(env_type))

    if cuda:
        bgru_net = bgru_net.cuda()

    bgru_net.load_state_dict(torch.load(bgru_net_path, map_location='cpu'))
    bgru_net.eval()

    max_actions = 10000  # hard cap on the number of steps in one episode
    max_same_action = 5000  # length of an identical-action run that aborts
    random.seed(0)
    observations = []
    with torch.no_grad():
        for _ in range(episodes):
            done = False
            obs = env.reset()
            curr_state = bgru_net.init_hidden()
            if cuda:
                curr_state = curr_state.cuda()
            ep_actions = []
            while not done:
                obs_tensor = torch.Tensor(obs).unsqueeze(0)
                if cuda:
                    obs_tensor = obs_tensor.cuda()
                _, logit, next_state, _, _ = bgru_net(
                    (obs_tensor, curr_state), inspect=True)
                observations.append(obs_tensor)

                # Greedy policy: index of the most probable action.
                prob = F.softmax(logit, dim=1)
                next_action = int(prob.max(1)[1].item())

                obs, _, done, _ = env.step(next_action)
                # The action history drives both termination guards below,
                # so every executed action must be recorded.
                ep_actions.append(next_action)

                if len(ep_actions) > max_actions:
                    done = True
                # Guard against the agent getting stuck: stop once the last
                # ``max_same_action`` actions are all identical.
                if len(ep_actions) > max_same_action:
                    recent_actions = ep_actions[-max_same_action:]
                    if recent_actions.count(
                            recent_actions[0]) == max_same_action:
                        done = True
                curr_state = next_state

    # The cache directory may not exist on the first run.
    os.makedirs(cache_dir, exist_ok=True)
    torch.save(observations, cache_path)
    return observations
Esempio n. 4
0
    def state_encode(self, state):
        """Return ``state`` after passing it through ``self.bhx_net.encode``."""
        encoded_state = self.bhx_net.encode(state)
        return encoded_state

    def obs_encode(self, obs, hx=None):
        """Run ``obs`` through the GRU network and return the encoded input.

        Args:
            obs: Observation forwarded to ``self.gru_net``.
            hx: Hidden state paired with ``obs``. When omitted, a zero tensor
                of shape ``(1, self.gru_units)`` is used, moved to the GPU if
                the model's first parameter lives there.

        Returns:
            The last element of the fourth output of ``self.gru_net`` when it
            is called with ``inspect=True`` and this model's ``obx_net`` /
            ``bhx_net`` as the input and hidden-state functions.
        """
        if hx is None:
            hx = Variable(torch.zeros(1, self.gru_units))
            first_param = next(self.parameters())
            if first_param.is_cuda:
                hx = hx.cuda()

        outputs = self.gru_net((obs, hx),
                               input_fn=self.obx_net,
                               hx_fn=self.bhx_net,
                               inspect=True)
        # Both the outer result and its fourth element must be 4-tuples.
        _, _, _, inspected = outputs
        _, _, _, input_x = inspected
        return input_x


if __name__ == '__main__':
    args = tl.get_args()
    env = atari_wrapper(args.env)
    env.seed(args.env_seed)
    obs = env.reset()

    # create directories to store results
    result_dir = tl.ensure_directory_exits(os.path.join(args.result_dir, 'Atari'))
    env_dir = tl.ensure_directory_exits(os.path.join(result_dir, args.env))

    gru_dir = tl.ensure_directory_exits(os.path.join(env_dir, 'gru_{}'.format(args.gru_size)))
    gru_net_path = os.path.join(gru_dir, 'model.p')
    gru_plot_dir = tl.ensure_directory_exits(os.path.join(gru_dir, 'Plots'))

    bhx_dir = tl.ensure_directory_exits(
        os.path.join(env_dir, 'gru_{}_bhx_{}{}'.format(args.gru_size, args.bhx_size, args.bhx_suffix)))
    bhx_net_path = os.path.join(bhx_dir, 'model.p')
    bhx_plot_dir = tl.ensure_directory_exits(os.path.join(bhx_dir, 'Plots'))