Code example #1
File: train.py  Project: Xingyu-Lin/softagent
def make_env(args):
    symbolic = args.env_kwargs['observation_mode'] != 'cam_rgb'
    args.encoder_type = 'identity' if symbolic else 'pixel'

    env = Env(args.env_name, symbolic, args.seed, 200, 1, 8, args.im_size, env_kwargs=args.env_kwargs, normalize_observation=False,
              scale_reward=args.scale_reward, clip_obs=args.clip_obs)
    env.seed(args.seed)
    return env
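For reference, a minimal driver sketch for make_env, assuming an argparse-style namespace: the attribute names mirror those read inside make_env above, while the concrete values (environment name, image size, reward scaling, env_kwargs contents) are placeholders rather than the project's actual defaults.

import argparse

# Hypothetical caller of make_env(); every value below is illustrative only,
# and env_kwargs would normally carry the full environment configuration.
args = argparse.Namespace(
    env_name='ClothFlatten',
    env_kwargs={'observation_mode': 'cam_rgb'},
    seed=0,
    im_size=128,
    scale_reward=1.0,
    clip_obs=None,
)
env = make_env(args)  # also sets args.encoder_type as a side effect
obs = env.reset()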
Code example #2
def generate_env_state(env_name):
    kwargs = env_arg_dict[env_name]
    kwargs['headless'] = True
    kwargs['use_cached_states'] = False
    kwargs['num_variations'] = 1000
    kwargs['save_cached_states'] = True
    # Env wrapper
    env = Env(env_name, False, 100, 200, 1, 8, 128, kwargs)
    return env
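A sketch of how generate_env_state might be driven to pre-generate cached states for several environments; env_arg_dict is assumed to be importable from the surrounding project, and the environment names are examples that appear elsewhere on this page.

# Hypothetical driver: constructing each Env triggers generation (and saving)
# of the 1000 cached state variations configured above.
if __name__ == '__main__':
    for name in ['PassWater', 'PourWater', 'ClothFlatten']:
        env = generate_env_state(name)
        env.close()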
Code example #3
File: interact.py  Project: zh0123210/continuous-rl
def interact(env: Env, agent: Agent,
             start_obs: Arrayable) -> Tuple[array, array, array]:
    """One step interaction between env and agent.

    :args env: environment
    :args agent: agent
    :args start_obs: initial observation

    :return: (next observation, reward, terminal?)
    """
    action = agent.step(start_obs)
    next_obs, reward, done, information = env.step(action)
    time_limit = information['time_limit'] if 'time_limit' in information else None
    agent.observe(next_obs, reward, done, time_limit)
    return next_obs, reward, done
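As a minimal sketch of how interact composes into a rollout (code example #7 below shows the full version), the helper assumes env and agent are already constructed and that rewards come back as arrays because the environment is vectorized.

import numpy as np

# Hypothetical helper: runs nb_steps one-step interactions and stacks the
# per-step reward arrays into a single (nb_steps, nb_envs) array.
def collect_rewards(env: Env, agent: Agent, nb_steps: int) -> np.ndarray:
    obs = env.reset()
    rewards = []
    for _ in range(nb_steps):
        obs, reward, _ = interact(env, agent, obs)
        rewards.append(reward)
    return np.stack(rewards, axis=0)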
Code example #4
def main(policy_file, seed, n_test_rollouts, render, exploit, record_video):
    if torch.cuda.is_available():
        device = torch.device('cuda:1')
        torch.cuda.manual_seed(seed)
    else:
        device = torch.device('cpu')
    np.random.seed(seed=seed)
    json_file = os.path.join(os.path.dirname(policy_file), 'variant.json')
    print('Load variants from {}'.format(json_file))
    with open(json_file) as f:
        vv = json.load(f)
    vv['env_kwargs']['headless'] = 1 - render
    vv['saved_models'] = policy_file

    env = Env(vv['env_name'],
              vv['symbolic_env'],
              vv['seed'],
              vv['max_episode_length'],
              vv['action_repeat'],
              vv['bit_depth'],
              env_kwargs=vv['env_kwargs'])
    agent = PlaNetAgent(env, vv, device)

    all_rewards = []
    agent.set_model_eval()
    with torch.no_grad():
        for i in range(n_test_rollouts):
            observation, total_reward = agent.env.reset(), 0
            belief, posterior_state, action = torch.zeros(1, vv['belief_size'], device=device), \
                                              torch.zeros(1, vv['state_size'], device=device), \
                                              torch.zeros(1, env.action_size, device=device)
            for t in range(vv['env_kwargs']['horizon']):
                belief, posterior_state, action, next_observation, reward, done = \
                    agent.update_belief_and_act(agent.env, belief, posterior_state, action, observation.to(device=agent.device),
                                                explore=(exploit != 0))
                total_reward += reward
                observation = next_observation
                if done:
                    break

            print('episode: {}, total reward: {}'.format(i, total_reward))
            all_rewards.append(total_reward)
    print('Average total reward:', np.mean(np.array(all_rewards)))
Code example #5
File: train.py  Project: Xingyu-Lin/softagent
def run_task(arg_vv, log_dir, exp_name):
    if arg_vv['algorithm'] == 'planet':
        from planet.config import DEFAULT_PARAMS
    elif arg_vv['algorithm'] == 'dreamer':
        from dreamer.config import DEFAULT_PARAMS
    else:
        raise NotImplementedError

    vv = DEFAULT_PARAMS
    vv.update(**arg_vv)
    vv = update_env_kwargs(vv)
    vv['max_episode_length'] = vv['env_kwargs']['horizon']

    # Configure logger
    logger.configure(dir=log_dir, exp_name=exp_name)
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Configure torch
    if torch.cuda.is_available():
        device = torch.device('cuda:1') if torch.cuda.device_count() > 1 else torch.device('cuda:0')
        torch.cuda.manual_seed(vv['seed'])
    else:
        device = torch.device('cpu')

    # Dump parameters
    with open(osp.join(logger.get_dir(), 'variant.json'), 'w') as f:
        json.dump(vv, f, indent=2, sort_keys=True)
    env = Env(vv['env_name'],
              vv['symbolic_env'],
              vv['seed'],
              vv['max_episode_length'],
              vv['action_repeat'],
              vv['bit_depth'],
              vv['image_dim'],
              env_kwargs=vv['env_kwargs'])

    if vv['algorithm'] == 'planet':
        from planet.planet_agent import PlaNetAgent
        agent = PlaNetAgent(env, vv, device)
        agent.train(train_epoch=vv['train_epoch'])
        env.close()
    elif vv['algorithm'] == 'dreamer':
        from dreamer.dreamer_agent import DreamerAgent
        agent = DreamerAgent(env, vv, device)
        agent.train(train_episode=vv['train_episode'])
        env.close()
Code example #6
File: train.py  Project: Xingyu-Lin/softagent
def main(args):
    if args.seed == -1:
        args.__dict__["seed"] = np.random.randint(1, 1000000)
    utils.set_seed_everywhere(args.seed)

    args.__dict__ = update_env_kwargs(args.__dict__)  # Update env_kwargs

    symbolic = args.env_kwargs['observation_mode'] != 'cam_rgb'
    args.encoder_type = 'identity' if symbolic else 'pixel'

    env = Env(args.env_name,
              symbolic,
              args.seed,
              200,
              1,
              8,
              args.pre_transform_image_size,
              env_kwargs=args.env_kwargs,
              normalize_observation=False,
              scale_reward=args.scale_reward,
              clip_obs=args.clip_obs)
    env.seed(args.seed)

    # make directory
    ts = time.gmtime()
    ts = time.strftime("%m-%d", ts)

    args.work_dir = logger.get_dir()

    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
    buffer_dir = utils.make_dir(os.path.join(args.work_dir, 'buffer'))

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    action_shape = env.action_space.shape

    if args.encoder_type == 'pixel':
        obs_shape = (3, args.image_size, args.image_size)
        pre_aug_obs_shape = (3, args.pre_transform_image_size,
                             args.pre_transform_image_size)
    else:
        obs_shape = env.observation_space.shape
        pre_aug_obs_shape = obs_shape

    replay_buffer = utils.ReplayBuffer(
        obs_shape=pre_aug_obs_shape,
        action_shape=action_shape,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device,
        image_size=args.image_size,
    )

    agent = make_agent(obs_shape=obs_shape,
                       action_shape=action_shape,
                       args=args,
                       device=device)

    L = Logger(args.work_dir, use_tb=args.save_tb, chester_logger=logger)

    episode, episode_reward, done, ep_info = 0, 0, True, []
    start_time = time.time()
    for step in range(args.num_train_steps):
        # evaluate agent periodically

        if step % args.eval_freq == 0:
            L.log('eval/episode', episode, step)
            evaluate(env, agent, video_dir, args.num_eval_episodes, L, step,
                     args)
            if args.save_model and (step % (args.eval_freq * 5) == 0):
                agent.save(model_dir, step)
            if args.save_buffer:
                replay_buffer.save(buffer_dir)
        if done:
            if step > 0:
                if step % args.log_interval == 0:
                    L.log('train/duration', time.time() - start_time, step)
                    for key, val in get_info_stats([ep_info]).items():
                        L.log('train/info_' + key, val, step)
                    L.dump(step)
                start_time = time.time()
            if step % args.log_interval == 0:
                L.log('train/episode_reward', episode_reward, step)

            obs = env.reset()
            done = False
            ep_info = []
            episode_reward = 0
            episode_step = 0
            episode += 1
            if step % args.log_interval == 0:
                L.log('train/episode', episode, step)

        # sample action for data collection
        if step < args.init_steps:
            action = env.action_space.sample()
        else:
            with utils.eval_mode(agent):
                action = agent.sample_action(obs)

        # run training update
        if step >= args.init_steps:
            num_updates = 1
            for _ in range(num_updates):
                agent.update(replay_buffer, L, step)
        next_obs, reward, done, info = env.step(action)

        # allow infinite bootstrap: a time-limit cutoff is not treated as a true terminal
        ep_info.append(info)
        done_bool = 0 if episode_step + 1 == env.horizon else float(done)
        episode_reward += reward
        replay_buffer.add(obs, action, reward, next_obs, done_bool)

        obs = next_obs
        episode_step += 1
Code example #7
def evaluate(
        dt: float,
        epoch: int,
        env: Env,
        agent: Agent,
        eval_gap: float,  # noqa: C901
        time_limit: Optional[float] = None,
        eval_return: bool = False,
        progress_bar: bool = False,
        video: bool = False,
        no_log: bool = False,
        test: bool = False,
        eval_policy: bool = True) -> Optional[float]:
    """Evaluate agent in environment.

    :args dt: time discretization
    :args epoch: index of the current epoch
    :args env: environment
    :args agent: interacting agent
    :args eval_gap: number of normalized epochs (epochs divided by dt)
        between training steps
    :args time_limit: maximal physical time (number of steps divided by dt)
        spent in the environment
    :args eval_return: evaluate the return (in addition to the specific evaluation)?
    :args progress_bar: use a progress bar?
    :args video: log a video of the interaction?
    :args no_log: do we skip logging results?
    :args test: log to a different test summary
    :args eval_policy: if the exploitation policy is noisy,
        remove the noise before evaluating

    :return: return evaluated, None if no return is evaluated
    """
    log_gap = int(eval_gap / dt)
    agent.eval()
    if not eval_policy and isinstance(agent, OnlineAgent):
        agent.noisy_eval()
    agent.reset()
    R = None
    if eval_return:
        rewards, dones = [], []
        imgs = []
        time_limit = time_limit if time_limit else 10
        nb_steps = int(time_limit / dt)
        info(f"eval> evaluating on a physical time {time_limit}"
             f" ({nb_steps} steps in total)")
        obs = env.reset()
        iter_range = tqdm(range(nb_steps)) if progress_bar else range(nb_steps)
        for _ in iter_range:
            obs, reward, done = interact(env, agent, obs)
            rewards.append(reward)
            dones.append(done)
            if video:
                imgs.append(env.render(mode='rgb_array'))
        R = compute_return(np.stack(rewards, axis=0), np.stack(dones, axis=0))
        tag = "noisy" if not eval_policy else ""
        info(f"eval> At epoch {epoch}, {tag} return: {R}")
        if not no_log:
            if not eval_policy:
                log("Return_noisy", R, epoch)
            elif not video:  # don't log when outputting video
                if not test:
                    log("Return", R, epoch)
                else:
                    log("Return_test", R, epoch)
        if video:
            log_video("demo", epoch, np.stack(imgs, axis=0))

    if not no_log:
        specific_evaluation(epoch, log_gap, dt, env, agent)
    return R
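A call-site sketch for evaluate: the numeric values for dt, eval_gap and time_limit are placeholders, and env and agent are assumed to be the vectorized environment and agent built elsewhere in the repository.

# Hypothetical evaluation call: computes the return over roughly
# time_limit / dt interaction steps and logs it under the "Return" tag.
mean_return = evaluate(
    dt=0.02,
    epoch=10,
    env=env,
    agent=agent,
    eval_gap=1.0,
    time_limit=10.0,
    eval_return=True,
    progress_bar=True,
)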
Code example #8
    def full_render(self, mode='human'):
        for remote in self.remotes:
            remote.send(('render', None))
        imgs = [remote.recv() for remote in self.remotes]
        bigimg = tile_images(imgs)
        if mode == 'human':
            import cv2
            cv2.imshow('vecenv', bigimg[:, :, ::-1])
            cv2.waitKey(1)
        elif mode == 'rgb_array':
            return bigimg
        else:
            raise NotImplementedError


Env.register(SubprocVecEnv)


def VEnv(envs):
    if len(envs) == 1:
        return SingleVecEnv(envs)
    else:
        return SubprocVecEnv(envs)


if __name__ == '__main__':
    from envs.pusher import DiscretePusherEnv
    nenvs = 64
    envs = [DiscretePusherEnv() for _ in range(nenvs)]
    vec_env = SubprocVecEnv(envs)
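Assuming the names shown above, a short continuation of the example might exercise the VEnv dispatcher and full_render; only names visible in this snippet are used, and the rendered frame's shape depends on what the worker environments return.

from envs.pusher import DiscretePusherEnv

# Hypothetical usage: VEnv picks SingleVecEnv for a single env and
# SubprocVecEnv otherwise; full_render tiles one frame per worker.
single = VEnv([DiscretePusherEnv()])
batched = VEnv([DiscretePusherEnv() for _ in range(4)])
frame = batched.full_render(mode='rgb_array')  # tiled image array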
Code example #9
def main():
    load_path = [
        # 'data/corl_data/0717_planet_water/0717_planet_water_2020_07_17_03_05_41_0002', #PourWater
        'data/corl_data/0716_planet_cloth/0716_planet_cloth_2020_07_16_18_13_13_0004/', # ClothFlatten
        # './data/corl_data/0723-planet-PassWater/0723-planet-PassWater_2020_07_23_03_11_22_0003',
        # './data/corl_data/0724-planet-TransportTorus/0724-planet-TransportTorus_2020_07_24_03_04_09_0002/'
        # './data/corl_data/0722_planet_rigid_cloth_fold/0722_planet_rigid_cloth_fold_2020_07_22_22_37_24_0003/',  # Rigid Cloth Fold
        # Cloth Fold
        # 'data/corl_data/0719_planet_cloth_fold/0719_planet_cloth_fold_2020_07_19_02_35_15_0002'
        # './data/corl_data/0717_planet_rigid_cloth/0717_planet_rigid_cloth_2020_07_17_21_32_45_0001'  # Rigid Cloth Drop
    ]

    seed = 0
    n_test_rollouts = 8
    render = 0
    save_dir = 'data/planet_open_loop_predictions'
    for path in load_path:
        policy_file = osp.join(path, 'models_550.pth')

        if torch.cuda.is_available():
            device = torch.device('cuda:0')
            torch.cuda.manual_seed(seed)
        else:
            device = torch.device('cpu')
        np.random.seed(seed=seed)
        json_file = os.path.join(os.path.dirname(policy_file), 'variant.json')
        print('Load variants from {}'.format(json_file))
        with open(json_file) as f:
            vv = json.load(f)
        vv['env_kwargs']['headless'] = 1
        vv['saved_models'] = policy_file

        env = Env(vv['env_name'], vv['symbolic_env'], vv['seed'], vv['max_episode_length'], vv['action_repeat'], vv['bit_depth'], vv['image_dim'],
                  env_kwargs=vv['env_kwargs'])
        agent = PlaNetAgent(env, vv, device)

        all_rewards, all_frames, all_frames_reconstr = [], [], []
        agent.set_model_eval()
        with torch.no_grad():
            for i in range(n_test_rollouts):
                observation, total_reward = agent.env.reset(), 0
                belief, posterior_state, action = torch.zeros(1, vv['belief_size'], device=device), \
                                                  torch.zeros(1, vv['state_size'], device=device), \
                                                  torch.zeros(1, env.action_size, device=device)
                initial_belief, initial_posterior, initial_observation = belief.clone(), posterior_state.clone(), observation.clone()
                recorded_actions = [action]
                frames, frames_reconstr = [observation], [observation]
                for t in range(vv['env_kwargs']['horizon']):
                    belief, posterior_state, action, next_observation, reward, done, info = \
                        agent.update_belief_and_act(agent.env, belief, posterior_state, action, observation.to(device=agent.device),
                                                    explore=False)
                    recorded_actions.append(action)
                    total_reward += reward
                    observation = next_observation
                    frames.append(observation)
                    # frames.extend(info['flex_env_recorded_frames'])
                    if done:
                        break

                # Re-imagine without observation
                belief, state = initial_belief, initial_posterior
                for idx, action in enumerate(recorded_actions):
                    print('idx: ', idx)
                    if idx <= 5:
                        belief, _, _, _, state, _, _ = agent.transition_model(state, action.unsqueeze(dim=0), belief,
                                                                              agent.encoder(frames[idx].to(device=agent.device)).unsqueeze(dim=0))
                    else:
                        belief, state, _, _ = agent.transition_model(posterior_state, action.unsqueeze(dim=0), belief)
                    belief, state = belief.squeeze(dim=0), state.squeeze(dim=0)
                    # print('belief size:', belief.size(), 'state size:',  state.size())
                    frames_reconstr.append(agent.observation_model(belief, state).cpu())

                print('episode: {}, total reward: {}'.format(i, total_reward))
                all_rewards.append(total_reward)
                all_frames.append(frames)
                all_frames_reconstr.append(frames_reconstr)

                # Pick key frames
                num_key_frames = 5
                if vv['env_name'] in ['RigidClothDrop', 'ClothDrop']:
                    key_idx = get_spaced_idx(len(frames[:5]), num_key_frames)
                elif vv['env_name'] in ['RigidClothFold', 'ClothFold']:
                    key_idx = get_spaced_idx(len(frames[:15]), num_key_frames)
                else:
                    key_idx = get_spaced_idx(len(frames[:30]), num_key_frames)
                frame = torch.cat([frames[idx] for idx in key_idx], dim=0) + 0.5
                frame_reconstr = torch.cat([frames_reconstr[idx] for idx in key_idx], dim=0) + 0.5
                image_grid = make_grid(torch.cat([frame, frame_reconstr], dim=0), nrow=num_key_frames, pad_value=0.4706, padding=5)
                save_image(image_grid, osp.join(save_dir, vv['env_name'] + '_{}.png'.format(i)))
                # save_image(torch.as_tensor(frame), osp.join(save_dir, vv['env_name'] + '_gt_{}.png'.format(i)))
                # save_image(torch.as_tensor(frame_reconstr), osp.join(save_dir, vv['env_name'] + '_prediction_{}.png'.format(i)))

        for idx in [0, 4]:

            all_frames_ = all_frames[idx:idx + 4]  # Take 4 episodes at a time to visualize
            all_frames_reconstr_ = all_frames_reconstr[idx:idx + 4]
            video_frames = []
            for i in range(len(all_frames_[0])):
                frame = torch.cat([x[i] for x in all_frames_])
                frame_reconstr = torch.cat([x[i] for x in all_frames_reconstr_])
                video_frames.append(make_grid(torch.cat([frame, frame_reconstr], dim=3) + 0.5, nrow=4).numpy())
            print(video_frames[0].shape)
            write_video(video_frames, vv['env_name'] + str(idx), save_dir)  # Lossy compression
            print('Average total reward:', np.mean(np.array(all_rewards)))
Code example #10
File: main.py  Project: Xingyu-Lin/softagent
# Setup
results_dir = os.path.join('results', args.id)
os.makedirs(results_dir, exist_ok=True)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if torch.cuda.is_available() and not args.disable_cuda:
    args.device = torch.device('cuda')
    torch.cuda.manual_seed(args.seed)
else:
    args.device = torch.device('cpu')
metrics = {'steps': [], 'episodes': [], 'train_rewards': [], 'test_episodes': [], 'test_rewards': [],
           'observation_loss': [], 'reward_loss': [], 'kl_loss': []}

# Initialise training environment and experience replay memory
env = Env(args.env, args.symbolic_env, args.seed, args.max_episode_length, args.action_repeat, args.bit_depth)
if args.experience_replay != '' and os.path.exists(args.experience_replay):
    D = torch.load(args.experience_replay)
    metrics['steps'], metrics['episodes'] = [D.steps] * D.episodes, list(range(1, D.episodes + 1))
elif not args.test:
    D = ExperienceReplay(args.experience_size, args.symbolic_env, env.observation_size, env.action_size, args.bit_depth,
                         args.device)
    # Initialise dataset D with S random seed episodes
    for s in range(1, args.seed_episodes + 1):
        observation, done, t = env.reset(), False, 0
        while not done:
            action = env.sample_random_action()
            next_observation, reward, done = env.step(action)
            D.append(observation, action, reward, done)
            observation = next_observation
            t += 1