Example No. 1
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)")
    parser.add_argument('-w', '--weight_dir', type=str, default='', help='path to the trained weight directory')
    parser.add_argument('-i', '--iteration', type=int, default=0, help='algo iteration')
    parser.add_argument('-s', '--seconds', type=int, default=10, help='testing duration in seconds')
    args = parser.parse_args()
    weight_dir = args.weight_dir
    iteration = args.iteration

    task_path = os.path.dirname(os.path.realpath(__file__))
    rsc_path = task_path + "/../rsc"
    env_path = task_path + "/.."
    save_path = os.path.join(weight_dir, 'testing_' + str(iteration), datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    os.makedirs(save_path)

    for file in os.listdir(weight_dir):
        if file.startswith('cfg'):
            cfg_abs_path = weight_dir + '/' + file

    # config
    cfg = YAML().load(open(cfg_abs_path, 'r'))
    cfg['environment']['num_envs'] = 1

    impl = cart_pole_example_env(rsc_path, dump(cfg['environment'], Dumper=RoundTripDumper))
    env = VecEnvPython(impl)

    actor_net = rslgym_module.MLP([32, 32],
                               nn.Tanh,
                               env.observation_space.shape[0],
                               env.action_space.shape[0])

    actor = rslgym_module.Actor(actor_net,
                             rslgym_module.MultivariateGaussianDiagonalCovariance(env.action_space.shape[0], 1.0),
                             env.observation_space.shape[0], env.action_space.shape[0],
                             'cpu')

    snapshot = torch.load(weight_dir + '/snapshot' + str(iteration) + '.pt')
    actor.load_state_dict(snapshot['actor_state_dict'])

    if cfg['environment']['render']:
        env.wrapper.showWindow()

    if cfg['environment']['record_video']:
        env.start_recording_video(save_path + '/test.mp4')

    test_steps = int(args.seconds/cfg['environment']['control_dt'])

    torch.manual_seed(args.seed)

    # dummy step with a zero action to initialize the environment info before the rollout
    act = np.zeros(shape=(1, env.wrapper.getActionDim()), dtype=np.float32)
    _, _, _, new_info = env.step(act, visualize=cfg['environment']['render'])

    # containers for analysis
    actions = np.zeros(shape=(2, test_steps), dtype=np.float32)
    obs = np.zeros(shape=(4, test_steps), dtype=np.float32)

    ob = env.reset()
    try:
        for i in range(test_steps):
            if i % 100 == 0:
                env.reset()
            act = actor.noiseless_action(ob).cpu().detach().numpy()
            ob, rew, done, info = env.step(act, visualize=cfg['environment']['render'])
            obs[:, i] = ob
            actions[0, i] = info['action']
            actions[1, i] = act

    except KeyboardInterrupt:
        pass

    finally:
        if cfg['environment']['record_video']:
            env.stop_recording_video()

        if cfg['environment']['render']:
            env.wrapper.hideWindow()

        import matplotlib
        matplotlib.use('TkAgg')
        import matplotlib.pyplot as plt

        plt.figure()
        plt.plot(actions[0, :], label='applied action')
        plt.plot(actions[1, :], label='nn action')
        plt.grid()
        plt.legend()

        plt.figure()
        plt.plot(obs[0, :], label='cart pos')
        plt.plot(obs[2, :], label='cart vel')
        plt.grid()
        plt.legend()

        plt.figure()
        plt.plot(obs[1, :], label='pend pos')
        plt.plot(obs[3, :], label='pend vel')
        plt.grid()
        plt.legend()

        plt.show(block=False)
        input('press [ENTER] to exit')
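
All of the snippets on this page start at main() and omit their import preamble. For reference, a minimal preamble for Example No. 1, inferred from the names the snippet uses, could look like the sketch below; the rslgym-specific import paths (rslgym_module and the cart-pole wrapper module) are assumptions modeled on Example No. 7 further down and may differ in your installation.

# Assumed import preamble for Example No. 1 (not part of the original snippet).
import argparse
import datetime
import os

import numpy as np
import torch
import torch.nn as nn
from ruamel.yaml import YAML, dump, RoundTripDumper

# rslgym-specific imports; the exact module paths below are assumptions
# based on the pattern shown in Example No. 7.
import rslgym.algorithm.modules as rslgym_module            # MLP, Actor, MultivariateGaussianDiagonalCovariance (assumed path)
from rslgym.wrapper import VecEnvPython                      # python wrapper interface
from rslgym_wrapper_cart_pole import cart_pole_example_env   # compiled example environment (assumed name)
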
Example No. 2
def main():
    if LooseVersion(torch.__version__) < LooseVersion("1.5.0"):
        raise Exception("This script requires a PyTorch version >= 1.5.0")

    # config file arg
    parser = argparse.ArgumentParser()
    parser.add_argument('--cfg_name',
                        type=str,
                        default='/cfg.yaml',
                        help='configuration file')
    parser.add_argument("--demo",
                        action="store_true",
                        help="Just run evaluation, not training.")
    parser.add_argument("--demo-record",
                        action="store_true",
                        help="Save video of demo.")
    parser.add_argument("--load",
                        type=str,
                        default="",
                        help="Directory to load agent from.")
    parser.add_argument(
        "--log-interval",
        type=int,
        default=1000,
        help=
        "Interval in timesteps between outputting log messages during training",
    )
    parser.add_argument(
        "--eval-interval",
        type=int,
        default=5000,
        help="Interval in timesteps between evaluations.",
    )
    parser.add_argument(
        "--checkpoint-interval",
        type=int,
        default=5000,
        help="Interval in timesteps between saving checkpoint",
    )
    parser.add_argument(
        "--eval-n-runs",
        type=int,
        default=10,
        help="Number of episodes run for each evaluation.",
    )
    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='gpu id (-1 for cpu)')
    args = parser.parse_args()
    cfg_name = args.cfg_name

    # folder config & logdir
    task_path = os.path.dirname(os.path.realpath(__file__))
    rsc_path = task_path + "/../rsc"
    env_path = task_path + "/.."
    cfg_abs_path = task_path + "/../" + cfg_name
    log_dir = os.path.join(task_path, 'runs/pfrl_ppo')

    save_items = [env_path + '/Environment.hpp', cfg_abs_path, __file__]
    cfg_saver = ConfigurationSaver(log_dir, save_items, args)

    # environment
    cfg = YAML().load(open(cfg_abs_path, 'r'))
    impl = cart_pole_example_env(
        rsc_path, dump(cfg['environment'], Dumper=RoundTripDumper))
    env = VecEnvPython(impl)
    steps_per_episode = math.floor(cfg['environment']['max_time'] /
                                   cfg['environment']['control_dt'])
    total_steps_per_iteration = steps_per_episode * cfg['environment'][
        'num_envs']
    if total_steps_per_iteration % cfg['algorithm']['num_mini_batches'] != 0:
        raise Exception(
            "total steps per iteration must be divisible by num_mini_batches")

    total_steps_per_minibatch = int(total_steps_per_iteration /
                                    cfg['algorithm']['num_mini_batches'])
    log_interval_steps = total_steps_per_iteration  # log (print to terminal) at every algorithm iteration
    eval_interval_steps = total_steps_per_iteration * 20  # evaluate, record video, and update tensorboard every 20 iterations
    total_training_steps = cfg['algorithm'][
        'total_algo_updates'] * total_steps_per_iteration
    checkpoint_save_interval_steps = eval_interval_steps

    print('steps_per_episode: ', steps_per_episode)
    print('total_steps_per_iteration: ', total_steps_per_iteration)
    print('total_steps_per_minibatch: ', total_steps_per_minibatch)
    print('log_interval_steps: ', log_interval_steps)
    print('eval_interval_steps: ', eval_interval_steps)
    print('total_training_steps: ', total_training_steps)
    print('checkpoint_save_interval_steps: ', checkpoint_save_interval_steps)

    # seeding
    seed = cfg['environment']['seed']
    torch.manual_seed(seed)
    utils.set_random_seed(seed)  # Set a random seed used in PFRL

    # actor & critic
    policy = torch.nn.Sequential(
        nn.Linear(env.observation_space.shape[0], 32),
        nn.Tanh(),
        nn.Linear(32, 32),
        nn.Tanh(),
        nn.Linear(32, env.action_space.shape[0]),
        pfrl.policies.GaussianHeadWithStateIndependentCovariance(
            action_size=env.action_space.shape[0],
            var_type="diagonal",
            var_func=lambda x: torch.exp(2 * x),  # Parameterize log std
            var_param_init=0,  # log std = 0 => std = 1
        ),
    )

    vf = torch.nn.Sequential(
        nn.Linear(env.observation_space.shape[0], 32),
        nn.Tanh(),
        nn.Linear(32, 32),
        nn.Tanh(),
        nn.Linear(32, 1),
    )

    def ortho_init(layer, gain):
        nn.init.orthogonal_(layer.weight, gain=gain)
        nn.init.zeros_(layer.bias)

    ortho_init(policy[0], gain=1)
    ortho_init(policy[2], gain=1)
    ortho_init(policy[4], gain=1e-2)
    ortho_init(vf[0], gain=1)
    ortho_init(vf[2], gain=1)
    ortho_init(vf[4], gain=1)

    model = pfrl.nn.Branched(policy, vf)
    opt = torch.optim.Adam(model.parameters(),
                           lr=cfg['algorithm']['learning_rate'],
                           eps=1e-5)

    agent = pfrl.agents.PPO(model,
                            opt,
                            obs_normalizer=None,
                            gpu=args.gpu,
                            value_func_coef=cfg['algorithm']['vf_coef'],
                            update_interval=total_steps_per_iteration,
                            minibatch_size=total_steps_per_minibatch,
                            epochs=cfg['algorithm']['num_epochs'],
                            clip_eps_vf=None,
                            entropy_coef=cfg['algorithm']['ent_coef'],
                            standardize_advantages=True,
                            gamma=cfg['algorithm']['discount_factor'],
                            lambd=cfg['algorithm']['gae_lam'])

    # logger settings
    logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='')
    logger = logging.getLogger(__name__)

    if len(args.load) > 0:
        agent.load(args.load)

    if args.demo:
        if cfg['environment']['render']:
            env.show_window()
            if args.demo_record:
                env.start_recording_video(args.load + "/../demo_" +
                                          os.path.basename(args.load) + ".mp4")
        eval_stats = eval_performance_pfrl(
            env=env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=steps_per_episode,
            visualize=cfg['environment']['render'],
        )
        if cfg['environment']['render']:
            if args.demo_record:
                env.stop_recording_video()
            env.hide_window()
        print("n_runs: {} mean: {} median: {} stdev {}".format(
            args.eval_n_runs,
            eval_stats["mean"],
            eval_stats["median"],
            eval_stats["stdev"],
        ))
    else:
        train_agent_batch_with_evaluation_pfrl(
            agent=agent,
            env=env,
            outdir=cfg_saver.data_dir,
            steps=total_training_steps,
            eval_n_steps=steps_per_episode,
            eval_n_episodes=None,  # exactly one of eval_n_steps / eval_n_episodes must be None
            eval_interval=eval_interval_steps,  # in timesteps
            log_interval=log_interval_steps,  # in timesteps
            max_episode_len=steps_per_episode,
            visualize=cfg['environment']['render'],
            use_tensorboard=True,
            checkpoint_freq=checkpoint_save_interval_steps,
            logger=logger)
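
Example No. 2 pulls every hyperparameter from a YAML file (cfg.yaml by default). The keys it actually reads are collected below as a Python dict that mirrors the YAML structure; the values are placeholders for illustration only, not the original settings.

# Configuration keys read by Example No. 2, mirrored as a Python dict.
# Every value below is a placeholder, not taken from the original cfg.yaml.
cfg_sketch = {
    'environment': {
        'num_envs': 16,       # parallel simulation instances
        'max_time': 4.0,      # episode length in seconds
        'control_dt': 0.02,   # control step in seconds
        'render': False,      # show the visualizer during training
        'seed': 0,
    },
    'algorithm': {
        'num_mini_batches': 4,    # must divide steps_per_episode * num_envs
        'num_epochs': 4,
        'learning_rate': 3e-4,
        'vf_coef': 0.5,
        'ent_coef': 0.0,
        'discount_factor': 0.99,
        'gae_lam': 0.95,
        'total_algo_updates': 1000,
    },
}
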
Example No. 3
def main():
    if LooseVersion(torch.__version__) < LooseVersion("1.5.0"):
        raise Exception("This script requires a PyTorch version >= 1.5.0")

    parser = argparse.ArgumentParser()
    parser.add_argument('-w',
                        '--weight_dir',
                        type=str,
                        default='',
                        help='path to the trained weight directory')
    parser.add_argument('-s',
                        '--step_to_load',
                        type=int,
                        default=0,
                        help='step checkpoint to load')
    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='gpu id (-1 for cpu)')
    args = parser.parse_args()
    weight_dir = args.weight_dir
    step_to_load = args.step_to_load

    task_path = os.path.dirname(os.path.realpath(__file__))
    rsc_path = task_path + "/../rsc"
    save_path = os.path.join(
        weight_dir, 'testing_' + str(step_to_load),
        datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    os.makedirs(save_path)

    for file in os.listdir(weight_dir):
        if file.startswith('cfg_sac'):
            cfg_abs_path = weight_dir + '/' + file

    # config
    cfg = YAML().load(open(cfg_abs_path, 'r'))
    cfg['environment']['num_envs'] = 1
    cfg['environment']['num_threads'] = 1
    cfg['environment']['control_dt'] = cfg['testing']['control_dt']
    cfg['environment']['render'] = cfg['testing']['render']

    impl = anymal_example_env(rsc_path,
                              dump(cfg['environment'], Dumper=RoundTripDumper))
    env = VecEnvPython(impl)

    obs_space = env.observation_space
    action_space = env.action_space
    print("Observation space:", obs_space)
    print("Action space:", action_space)

    # seeding
    seed = cfg['environment']['seed']
    torch.manual_seed(seed)
    utils.set_random_seed(seed)  # Set a random seed used in PFRL

    obs_size = obs_space.low.size
    action_size = action_space.low.size

    def squashed_diagonal_gaussian_head(x):
        assert x.shape[-1] == action_size * 2
        mean, log_scale = torch.chunk(x, 2, dim=1)
        log_scale = torch.clamp(log_scale, -20.0, 2.0)
        var = torch.exp(log_scale * 2)
        base_distribution = distributions.Independent(
            distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1)
        # cache_size=1 is required for numerical stability
        return distributions.transformed_distribution.TransformedDistribution(
            base_distribution,
            [distributions.transforms.TanhTransform(cache_size=1)])

    policy = nn.Sequential(
        nn.Linear(obs_size, 256),
        nn.ReLU(),
        nn.Linear(256, 256),
        nn.ReLU(),
        nn.Linear(256, action_size * 2),
        Lambda(squashed_diagonal_gaussian_head),
    )

    policy_optimizer = torch.optim.Adam(policy.parameters(),
                                        lr=cfg['algorithm']['learning_rate'])

    def make_q_func_with_optimizer():
        q_func = nn.Sequential(
            pfrl.nn.ConcatObsAndAction(),
            nn.Linear(obs_size + action_size, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
        )
        torch.nn.init.xavier_uniform_(q_func[1].weight)
        torch.nn.init.xavier_uniform_(q_func[3].weight)
        torch.nn.init.xavier_uniform_(q_func[5].weight)
        q_func_optimizer = torch.optim.Adam(
            q_func.parameters(), lr=cfg['algorithm']['learning_rate'])
        return q_func, q_func_optimizer

    q_func1, q_func1_optimizer = make_q_func_with_optimizer()
    q_func2, q_func2_optimizer = make_q_func_with_optimizer()

    rbuf = replay_buffers.ReplayBuffer(cfg['algorithm']['replay_buffer_size'])

    def burnin_action_func():
        """Select random actions until model is updated one or more times."""
        return np.random.uniform(action_space.low,
                                 action_space.high).astype(np.float32)

    agent = pfrl.agents.SoftActorCritic(
        policy,
        q_func1,
        q_func2,
        policy_optimizer,
        q_func1_optimizer,
        q_func2_optimizer,
        rbuf,
        gamma=cfg['algorithm']['discount_factor'],
        replay_start_size=cfg['algorithm']['replay_start_size'],
        gpu=args.gpu,
        minibatch_size=cfg['algorithm']['minibatch_size'],
        burnin_action_func=burnin_action_func,
        entropy_target=-action_size,
        temperature_optimizer_lr=cfg['algorithm']['temperature_optimizer_lr'],
    )

    agent.load(weight_dir + '/' + str(step_to_load) + '_checkpoint')

    if cfg['testing']['render']:
        env.wrapper.showWindow()

    if cfg['testing']['record_video']:
        env.start_recording_video(save_path + '/test.mp4')

    test_steps = int(cfg['testing']['seconds'] / cfg['testing']['control_dt'])

    torch.manual_seed(cfg['environment']['seed'])

    act = np.ndarray(shape=(1, env.wrapper.getActionDim()), dtype=np.float32)
    _, _, _, new_info = env.step(act, visualize=cfg['testing']['render'])

    ob = env.reset()
    try:
        for i in range(test_steps):
            if i % 100 == 0:
                env.reset()
            with agent.eval_mode():
                agent.act_deterministically = True
                act = agent.batch_act(ob)

            ob, rew, done, info = env.step(act,
                                           visualize=cfg['testing']['render'])

    except KeyboardInterrupt:
        pass

    finally:
        if cfg['testing']['record_video']:
            env.stop_recording_video()
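
Example No. 3 (and Example No. 8 below) wrap squashed_diagonal_gaussian_head in a Lambda module that the snippet never defines; in pfrl's own SAC examples this comes from pfrl.nn.lmbda. If your pfrl version does not expose it, a minimal equivalent wrapper is sketched below.

import torch.nn as nn


class Lambda(nn.Module):
    """Wrap an arbitrary callable as an nn.Module so it can be used inside nn.Sequential."""

    def __init__(self, fn):
        super().__init__()
        self.fn = fn

    def forward(self, x):
        return self.fn(x)
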
Example No. 4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--cfg_name',
                        type=str,
                        default='/cfg.yaml',
                        help='configuration file')
    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='gpu id (-1 for cpu)')
    args = parser.parse_args()
    cfg_name = args.cfg_name

    device = args.gpu if args.gpu >= 0 else 'cpu'  # -1 selects CPU; any non-negative id selects that GPU

    task_path = os.path.dirname(os.path.realpath(__file__))
    rsc_path = task_path + "/../rsc"
    env_path = task_path + "/.."
    cfg_abs_path = task_path + "/../" + cfg_name

    log_dir = os.path.join(task_path, 'runs/rsl_ppo')

    save_items = [env_path + '/Environment.hpp', cfg_abs_path]
    cfg_saver = ConfigurationSaver(log_dir, save_items, args)

    # config
    cfg = YAML().load(open(cfg_abs_path, 'r'))
    impl = cart_pole_example_env(
        rsc_path, dump(cfg['environment'], Dumper=RoundTripDumper))
    env = VecEnvPython(impl)
    n_steps = math.floor(cfg['environment']['max_time'] /
                         cfg['environment']['control_dt'])

    total_steps_per_episode = n_steps * cfg['environment']['num_envs']

    torch.manual_seed(cfg['environment']['seed'])

    obs_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]

    actor_net = rslgym_module.MLP([32, 32], nn.Tanh, obs_size, action_size,
                                  0.5)
    critic_net = rslgym_module.MLP([32, 32], nn.Tanh, obs_size, 1, 0.5)
    actor_dist = rslgym_module.MultivariateGaussianDiagonalCovariance(
        action_size, 1.0)

    actor = rslgym_module.Actor(actor_net, actor_dist, obs_size, action_size,
                                device)
    critic = rslgym_module.Critic(critic_net, obs_size, device)

    ppo_training = PPO(actor=actor,
                       critic=critic,
                       num_envs=cfg['environment']['num_envs'],
                       num_transitions_per_env=n_steps,
                       num_learning_epochs=cfg['algorithm']['num_epochs'],
                       gamma=cfg['algorithm']['discount_factor'],
                       lam=cfg['algorithm']['gae_lam'],
                       entropy_coef=cfg['algorithm']['ent_coef'],
                       num_mini_batches=cfg['algorithm']['num_mini_batches'],
                       device=device,
                       log_dir=cfg_saver.data_dir,
                       mini_batch_sampling="in_order",
                       learning_rate=cfg['algorithm']['learning_rate'])

    avg_rewards = []
    fig, ax = plt.subplots()
    for update in range(cfg['algorithm']['total_algo_updates']):

        start = time.time()
        obs = env.reset()
        reward_ll_sum = 0
        done_sum = 0
        # track the number of consecutive steps since the latest "done" for each env;
        # one env may terminate several times per iteration, and its count is reset on every done
        ep_len = np.zeros(shape=env.num_envs)

        if update % 20 == 0:
            env.show_window()
            env.start_recording_video(cfg_saver.data_dir + "/" + str(update) +
                                      ".mp4")
            for step in range(n_steps):
                action_ll, _ = actor.sample(
                    torch.from_numpy(obs).to(ppo_training.device))
                obs, reward_ll, dones, _ = env.step(
                    action_ll.cpu().detach().numpy(), True)

            ppo_training.save_training(cfg_saver.data_dir, update, update)
            obs = env.reset()
            env.stop_recording_video()
            env.hide_window()

        for step in range(n_steps):
            actor_obs = obs
            critic_obs = obs
            action = ppo_training.observe(actor_obs)
            obs, reward, dones, _ = env.step(action, False)
            ep_len[~dones] += 1
            ep_len[dones] = 0
            ppo_training.step(value_obs=critic_obs,
                              rews=reward,
                              dones=dones,
                              infos=[])
            done_sum = done_sum + sum(dones)
            reward_ll_sum = reward_ll_sum + sum(reward)

        ppo_training.update(actor_obs=obs,
                            value_obs=obs,
                            log_this_iteration=update % 10 == 0,
                            update=update)
        end = time.time()

        average_ll_performance = reward_ll_sum / total_steps_per_episode
        average_dones = done_sum / total_steps_per_episode
        avg_rewards.append(average_ll_performance)
        avg_ep_leng = ep_len.mean()

        ppo_training.writer.add_scalar('Policy/average_reward',
                                       average_ll_performance, update)
        ppo_training.writer.add_scalar('Policy/average_dones', average_dones,
                                       update)
        ppo_training.writer.add_scalar('Training/elapsed_time_episode',
                                       end - start, update)
        ppo_training.writer.add_scalar('Training/fps',
                                       total_steps_per_episode / (end - start),
                                       update)
        ppo_training.writer.add_scalar('Policy/avg_ep_len', avg_ep_leng,
                                       update)

        print('----------------------------------------------------')
        print('{:>6}th iteration'.format(update))
        print('{:<40} {:>6}'.format("average ll reward: ",
                                    '{:0.10f}'.format(average_ll_performance)))
        print('{:<40} {:>6}'.format("dones: ",
                                    '{:0.6f}'.format(average_dones)))
        print('{:<40} {:>6}'.format("avg_ep_len: ",
                                    '{:0.6f}'.format(avg_ep_leng)))
        print('{:<40} {:>6}'.format("time elapsed in this iteration: ",
                                    '{:6.4f}'.format(end - start)))
        print('{:<40} {:>6}'.format(
            "fps: ",
            '{:6.0f}'.format(total_steps_per_episode / (end - start))))
        print('----------------------------------------------------\n')

        if update > 100 and len(avg_rewards) > 100:
            ax.plot(range(len(avg_rewards)), savgol_filter(avg_rewards, 51, 3))
        else:
            ax.plot(range(len(avg_rewards)), avg_rewards)
        fig.savefig(cfg_saver.data_dir + '/demo.png', bbox_inches='tight')

        ax.clear()
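
The smoothing at the end of Example No. 4 relies on scipy's savgol_filter, whose import the snippet omits. A standalone call with the same window length and polynomial order, on stand-in data, is shown below; the filter needs a window no longer than the series (and odd in older scipy releases), which is why the snippet only switches to the smoothed curve once more than 100 rewards have been collected.

# Standalone Savitzky-Golay smoothing of a reward curve, as used in Example No. 4.
import numpy as np
from scipy.signal import savgol_filter

avg_rewards = np.random.rand(200)   # stand-in data for the collected average rewards
smoothed = savgol_filter(avg_rewards, window_length=51, polyorder=3)
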
Example No. 5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)")
    parser.add_argument('--cfg_name', type=str, default='/cfg_trpo.yaml', help='configuration file')
    parser.add_argument('--gpu', type=int, default=0, help='gpu id (-1 for cpu)')
    args = parser.parse_args()
    cfg_name = args.cfg_name

    device = args.gpu if args.gpu >= 0 else 'cpu'  # -1 selects CPU; any non-negative id selects that GPU

    task_path = os.path.dirname(os.path.realpath(__file__))
    rsc_path = task_path + "/../rsc"
    env_path = task_path + "/.."
    cfg_abs_path = task_path + "/.." + cfg_name
    log_dir = os.path.join(task_path, 'runs/rsl_trpo')

    save_items = [env_path+'/Environment.hpp',
                  cfg_abs_path,
                  os.path.realpath(__file__)]

    cfg_saver = ConfigurationSaver(log_dir, save_items, args)

    # config
    cfg = YAML().load(open(cfg_abs_path, 'r'))
    impl = anymal_example_env(rsc_path, dump(cfg['environment'], Dumper=RoundTripDumper))
    env = VecEnvPython(impl)
    n_steps = math.floor(cfg['environment']['max_time'] / cfg['environment']['control_dt'])

    total_steps_per_episode = n_steps * cfg['environment']['num_envs']

    torch.manual_seed(args.seed)

    actor_net = rslgym_module.MLP([256, 128],
                                  nn.Tanh,
                                  env.observation_space.shape[0],
                                  env.action_space.shape[0],
                                  init_scale=1.4)

    critic_net = rslgym_module.MLP([256, 128],
                                   nn.Tanh,
                                   env.observation_space.shape[0],
                                   1,
                                   init_scale=1.4)

    actor = rslgym_module.Actor(actor_net,
                  rslgym_module.MultivariateGaussianDiagonalCovariance(env.action_space.shape[0], 1.0),
                  env.observation_space.shape[0], env.action_space.shape[0],
                  device)

    critic = rslgym_module.Critic(critic_net,
                    env.observation_space.shape[0],
                    device)

    agent = TRPO(
        actor=actor,
        critic=critic,
        num_envs=cfg['environment']['num_envs'],
        num_transitions_per_env=n_steps,
        critic_learning_epochs=cfg['algorithm']['critic_learning']['epochs'],
        critic_learning_rate=cfg['algorithm']['critic_learning']['learning_rate'],
        critic_mini_batches=cfg['algorithm']['critic_learning']['num_mini_batches'],
        max_d_kl=cfg['algorithm']['max_kld'],
        gamma=cfg['algorithm']['discount_factor'],
        lam=cfg['algorithm']['gae_lam'],
        entropy_coef=cfg['algorithm']['entropy_coef'],
        device=device,
        log_dir=cfg_saver.data_dir,
        mini_batch_sampling="in_order"
    )

    avg_rewards = []
    for update in range(cfg['algorithm']['total_algorithm_updates']):

        start = time.time()
        obs = env.reset()

        reward_ll_sum = 0
        ep_len = np.zeros(shape=env.num_envs)
        ep_len_collected = []

        if update % cfg['environment']['eval_every_n'] == 0:
            env.show_window()
            if cfg['environment']['record_video']:
                env.start_recording_video(cfg_saver.data_dir + "/" + str(update) + ".mp4")
            for step in range(n_steps):
                action_ll, _ = actor.sample(torch.from_numpy(obs).to(agent.device))
                obs, reward_ll, dones, info = env.step(action_ll.cpu().detach().numpy(), True)

            agent.save_training(cfg_saver.data_dir, update, update)
            obs = env.reset()
            if cfg['environment']['record_video']:
                env.stop_recording_video()
            env.hide_window()

        for step in range(n_steps):
            actor_obs = obs
            critic_obs = obs
            action = agent.observe(actor_obs)
            obs, reward, dones, info = env.step(action, False)
            agent.step(value_obs=critic_obs, rews=reward, dones=dones, infos=[])
            reward_ll_sum = reward_ll_sum + sum(reward)

            ep_len += 1
            if any(dones):
                ep_len_collected += list(ep_len[dones])
                ep_len[dones] = 0
            if step == n_steps - 1:
                for length in list(ep_len):
                    if length == n_steps:
                        ep_len_collected.append(length)

        agent.update(actor_obs=obs,
                     value_obs=obs,
                     log_this_iteration=update % 10 == 0,
                     update=update)
        end = time.time()
        # floor the policy std at 0.2 to keep exploration noise from collapsing (12 = ANYmal action dimension)
        actor.distribution.enforce_minimum_std((torch.ones(12) * 0.2).to(device))

        average_ll_performance = reward_ll_sum / total_steps_per_episode
        avg_rewards.append(average_ll_performance)
        if len(ep_len_collected) > 0:
            # approximate: episodes still running at the end of the rollout are counted at full length
            avg_ep_leng = sum(ep_len_collected) / len(ep_len_collected)
        else:
            avg_ep_leng = n_steps

        agent.writer.add_scalar('Policy/average_reward', average_ll_performance, update)
        agent.writer.add_scalar('Training/elapsed_time_episode', end - start, update)
        agent.writer.add_scalar('Training/fps', total_steps_per_episode / (end - start), update)
        agent.writer.add_scalar('Policy/avg_ep_len', avg_ep_leng, update)

        print('----------------------------------------------------')
        print('{:>6}th iteration'.format(update))
        print('{:<40} {:>6}'.format("average ll reward: ", '{:0.10f}'.format(average_ll_performance)))
        print('{:<40} {:>6}'.format("avg_ep_len: ", '{:0.6f}'.format(avg_ep_leng)))
        print('{:<40} {:>6}'.format("time elapsed in this iteration: ", '{:6.4f}'.format(end - start)))
        print('{:<40} {:>6}'.format("fps: ", '{:6.0f}'.format(total_steps_per_episode / (end - start))))
        print('{:<40} {:>6}'.format("std: ", '{}'.format(actor.distribution.log_std.exp())))
        print('----------------------------------------------------\n')
Example No. 6
def main():
    if LooseVersion(torch.__version__) < LooseVersion("1.5.0"):
        raise Exception("This script requires a PyTorch version >= 1.5.0")

    parser = argparse.ArgumentParser()
    parser.add_argument('-w', '--weight_dir', type=str, default='', help='path to the trained weight directory')
    parser.add_argument('-s', '--step_to_load', type=int, default=0, help='step checkpoint to load')
    parser.add_argument('-t', '--test_steps', type=int, default=10, help='testing duration in secs')
    args = parser.parse_args()
    weight_dir = args.weight_dir
    step_to_load = args.step_to_load

    task_path = os.path.dirname(os.path.realpath(__file__))
    rsc_path = task_path + "/../rsc"
    save_path = os.path.join(weight_dir, 'testing_' + str(step_to_load), datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    os.makedirs(save_path)

    for file in os.listdir(weight_dir):
        if file.startswith('cfg'):
            cfg_abs_path = weight_dir + '/' + file

    # config
    cfg = YAML().load(open(cfg_abs_path, 'r'))
    cfg['environment']['num_envs'] = 1
    cfg['environment']['num_threads'] = 1

    impl = cart_pole_example_env(rsc_path, dump(cfg['environment'], Dumper=RoundTripDumper))
    env = VecEnvPython(impl)

    steps_per_episode = math.floor(cfg['environment']['max_time'] / cfg['environment']['control_dt'])
    total_steps_per_iteration = steps_per_episode * cfg['environment']['num_envs']
    if total_steps_per_iteration % cfg['algorithm']['num_mini_batches'] != 0:
        raise Exception("total steps per iteration must be divisible by num_mini_batches")
    total_steps_per_minibatch = int(total_steps_per_iteration/cfg['algorithm']['num_mini_batches'])

    # actor & critic
    policy = torch.nn.Sequential(
        nn.Linear(env.observation_space.shape[0], 32),
        nn.Tanh(),
        nn.Linear(32, 32),
        nn.Tanh(),
        nn.Linear(32, env.action_space.shape[0]),
        pfrl.policies.GaussianHeadWithStateIndependentCovariance(
            action_size=env.action_space.shape[0],
            var_type="diagonal",
            var_func=lambda x: torch.exp(2 * x),  # Parameterize log std
            var_param_init=0,  # log std = 0 => std = 1
        ),
    )

    vf = torch.nn.Sequential(
        nn.Linear(env.observation_space.shape[0], 32),
        nn.Tanh(),
        nn.Linear(32, 32),
        nn.Tanh(),
        nn.Linear(32, 1),
    )

    model = pfrl.nn.Branched(policy, vf)
    opt = torch.optim.Adam(model.parameters(), lr=cfg['algorithm']['learning_rate'], eps=1e-5)

    agent = pfrl.agents.PPO(
        model,
        opt,
        obs_normalizer=None,
        gpu=0,
        value_func_coef=cfg['algorithm']['vf_coef'],
        update_interval=total_steps_per_iteration,
        minibatch_size=total_steps_per_minibatch,
        epochs=cfg['algorithm']['num_epochs'],
        clip_eps_vf=None,
        entropy_coef=cfg['algorithm']['ent_coef'],
        standardize_advantages=True,
        gamma=cfg['algorithm']['discount_factor'],
        lambd=cfg['algorithm']['gae_lam']
    )

    agent.load(weight_dir + '/' + str(step_to_load) + '_checkpoint')

    if cfg['environment']['render']:
        env.wrapper.showWindow()

    if cfg['environment']['record_video']:
        env.start_recording_video(save_path + '/test.mp4')

    test_steps = int(args.test_steps/cfg['environment']['control_dt'])

    torch.manual_seed(cfg['environment']['seed'])

    # dummy step with a zero action to initialize the environment info before the rollout
    act = np.zeros(shape=(1, env.wrapper.getActionDim()), dtype=np.float32)
    _, _, _, new_info = env.step(act, visualize=cfg['environment']['render'])

    ob = env.reset()
    try:
        for i in range(test_steps):
            if i % 100 == 0:
                env.reset()
            with agent.eval_mode():
                agent.act_deterministically = True
                act = agent.batch_act(ob)

            ob, rew, done, info = env.step(act, visualize=cfg['environment']['render'])

    except KeyboardInterrupt:
        pass

    finally:
        if cfg['environment']['record_video']:
            env.stop_recording_video()
Example No. 7
#!/usr/bin/env python3
import os
import numpy as np
import ruamel.yaml

from rslgym.wrapper import VecEnvPython  # import python wrapper interface
from rslgym_wrapper_anymal import anymal_example_env  # import your environment

task_path = os.path.dirname(os.path.realpath(__file__))
rsc_path = task_path + "/../rsc"
cfg_abs_path = task_path + "/../cfg_ppo.yaml"
cfg = ruamel.yaml.YAML().load(open(cfg_abs_path, 'r'))

dumped_cfg = ruamel.yaml.dump(cfg['environment'], Dumper=ruamel.yaml.RoundTripDumper)
env = VecEnvPython(anymal_example_env(rsc_path, dumped_cfg))

print('action_space ', env.action_space)
print('obs_space ', env.observation_space)
print('num_envs ', env.num_envs)

render = cfg['environment']['render']
if render:
    env.show_window()

obs = env.reset()
info = env.get_info()
# rollout loop with random actions
for step in range(10000):
    # action = np.zeros((env.num_envs, env.action_space.shape[0])).astype(np.float32)
    action = np.random.randn(env.num_envs, env.action_space.shape[0]).astype(np.float32) * 0.1
    obs, reward, dones, info = env.step(action, visualize=render)
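
Example No. 7 drives the vectorized ANYmal environment with random actions. A sketch of swapping in a trained rslgym actor, reusing the snapshot-loading pattern of Examples No. 1 and No. 9, is shown below; the rslgym_module import path, the network sizes, and the snapshot path are assumptions and have to match the training run that produced the checkpoint.

# Sketch: replace the random actions above with a trained rslgym actor.
# The import path, network sizes and snapshot path are assumptions; they must
# match the training script that produced the checkpoint.
import torch
import torch.nn as nn
import rslgym.algorithm.modules as rslgym_module  # assumed import path

obs_size = env.observation_space.shape[0]
action_size = env.action_space.shape[0]

actor_net = rslgym_module.MLP([256, 128], nn.Tanh, obs_size, action_size)
actor = rslgym_module.Actor(
    actor_net,
    rslgym_module.MultivariateGaussianDiagonalCovariance(action_size, 1.0),
    obs_size, action_size, 'cpu')

snapshot = torch.load('runs/rsl_ppo/snapshot1000.pt')  # hypothetical training output
actor.load_state_dict(snapshot['actor_state_dict'])

obs = env.reset()
for step in range(10000):
    action = actor.noiseless_action(obs).cpu().detach().numpy()
    obs, reward, dones, info = env.step(action, visualize=render)
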
Example No. 8
def main():
    if LooseVersion(torch.__version__) < LooseVersion("1.5.0"):
        raise Exception("This script requires a PyTorch version >= 1.5.0")

    # config file arg
    parser = argparse.ArgumentParser()
    parser.add_argument('--cfg_name',
                        type=str,
                        default='cfg_sac.yaml',
                        help='configuration file')
    parser.add_argument("--demo",
                        action="store_true",
                        help="Just run evaluation, not training.")
    parser.add_argument("--demo-record",
                        action="store_true",
                        help="Save video of demo.")
    parser.add_argument("--load",
                        type=str,
                        default="",
                        help="Directory to load agent from.")
    parser.add_argument(
        "--log-interval",
        type=int,
        default=1000,
        help=
        "Interval in timesteps between outputting log messages during training",
    )
    parser.add_argument(
        "--eval-interval",
        type=int,
        default=5000,
        help="Interval in timesteps between evaluations.",
    )
    parser.add_argument(
        "--checkpoint-interval",
        type=int,
        default=5000,
        help="Interval in timesteps between saving checkpoint",
    )
    parser.add_argument(
        "--eval-n-runs",
        type=int,
        default=10,
        help="Number of episodes run for each evaluation.",
    )
    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='gpu id (-1 for cpu)')
    args = parser.parse_args()
    cfg_name = args.cfg_name

    # folder config & logdir
    task_path = os.path.dirname(os.path.realpath(__file__))
    rsc_path = task_path + "/../rsc"
    env_path = task_path + "/.."
    cfg_abs_path = task_path + "/../" + cfg_name
    log_dir = os.path.join(task_path, 'runs/pfrl_sac')

    save_items = [env_path + '/Environment.hpp', cfg_abs_path, __file__]
    if not args.demo:
        cfg_saver = ConfigurationSaver(log_dir, save_items, args)

    # environment
    cfg = YAML().load(open(cfg_abs_path, 'r'))
    impl = anymal_example_env(rsc_path,
                              dump(cfg['environment'], Dumper=RoundTripDumper))
    env = VecEnvPython(impl)
    steps_per_episode = math.floor(cfg['environment']['max_time'] /
                                   cfg['environment']['control_dt'])
    total_steps_per_iteration = steps_per_episode * cfg['environment'][
        'num_envs']

    total_training_steps = cfg['algorithm'][
        'total_algorithm_updates'] * total_steps_per_iteration

    obs_space = env.observation_space
    action_space = env.action_space
    print("Observation space:", obs_space)
    print("Action space:", action_space)

    # seeding
    seed = cfg['environment']['seed']
    torch.manual_seed(seed)
    utils.set_random_seed(seed)  # Set a random seed used in PFRL

    obs_size = obs_space.low.size
    action_size = action_space.low.size

    def squashed_diagonal_gaussian_head(x):
        assert x.shape[-1] == action_size * 2
        mean, log_scale = torch.chunk(x, 2, dim=1)
        log_scale = torch.clamp(log_scale, -20.0, 2.0)
        var = torch.exp(log_scale * 2)
        base_distribution = distributions.Independent(
            distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1)
        # cache_size=1 is required for numerical stability
        return distributions.transformed_distribution.TransformedDistribution(
            base_distribution,
            [distributions.transforms.TanhTransform(cache_size=1)])

    policy = nn.Sequential(
        nn.Linear(obs_size, 256),
        nn.ReLU(),
        nn.Linear(256, 256),
        nn.ReLU(),
        nn.Linear(256, action_size * 2),
        Lambda(squashed_diagonal_gaussian_head),
    )
    torch.nn.init.xavier_uniform_(policy[0].weight)
    torch.nn.init.xavier_uniform_(policy[2].weight)
    torch.nn.init.xavier_uniform_(policy[4].weight, gain=1.0)
    policy_optimizer = torch.optim.Adam(policy.parameters(),
                                        lr=cfg['algorithm']['learning_rate'])

    def make_q_func_with_optimizer():
        q_func = nn.Sequential(
            pfrl.nn.ConcatObsAndAction(),
            nn.Linear(obs_size + action_size, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
        )
        torch.nn.init.xavier_uniform_(q_func[1].weight)
        torch.nn.init.xavier_uniform_(q_func[3].weight)
        torch.nn.init.xavier_uniform_(q_func[5].weight)
        q_func_optimizer = torch.optim.Adam(
            q_func.parameters(), lr=cfg['algorithm']['learning_rate'])
        return q_func, q_func_optimizer

    q_func1, q_func1_optimizer = make_q_func_with_optimizer()
    q_func2, q_func2_optimizer = make_q_func_with_optimizer()

    rbuf = replay_buffers.ReplayBuffer(cfg['algorithm']['replay_buffer_size'])

    def burnin_action_func():
        """Select random actions until model is updated one or more times."""
        return np.random.uniform(action_space.low,
                                 action_space.high).astype(np.float32)

    # Hyperparameters in http://arxiv.org/abs/1802.09477
    agent = pfrl.agents.SoftActorCritic(
        policy,
        q_func1,
        q_func2,
        policy_optimizer,
        q_func1_optimizer,
        q_func2_optimizer,
        rbuf,
        gamma=cfg['algorithm']['discount_factor'],
        replay_start_size=cfg['algorithm']['replay_start_size'],
        gpu=args.gpu,
        minibatch_size=cfg['algorithm']['minibatch_size'],
        burnin_action_func=burnin_action_func,
        entropy_target=-action_size,
        temperature_optimizer_lr=cfg['algorithm']['temperature_optimizer_lr'],
    )

    # logger settings
    logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='')
    logger = logging.getLogger(__name__)

    if len(args.load) > 0:
        agent.load(args.load)

    if args.demo:
        if cfg['environment']['render']:
            env.show_window()
            if args.demo_record:
                env.start_recording_video(args.load + "/../demo_" +
                                          os.path.basename(args.load) + ".mp4")
        eval_stats = eval_performance_pfrl(
            env=env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=steps_per_episode,
            visualize=cfg['environment']['render'],
        )
        if cfg['environment']['render']:
            if args.demo_record:
                env.stop_recording_video()
            env.hide_window()
        print("n_runs: {} mean: {} median: {} stdev {}".format(
            args.eval_n_runs,
            eval_stats["mean"],
            eval_stats["median"],
            eval_stats["stdev"],
        ))
    else:
        train_agent_batch_with_evaluation_pfrl(
            agent=agent,
            env=env,
            outdir=cfg_saver.data_dir,
            steps=total_training_steps,
            eval_n_steps=steps_per_episode,
            eval_n_episodes=None,
            eval_interval=args.eval_interval,
            log_interval=args.log_interval,
            max_episode_len=steps_per_episode,
            visualize=cfg['environment']['render'],
            use_tensorboard=True,
            checkpoint_freq=args.checkpoint_interval,
            logger=logger)
Example No. 9
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed",
                        type=int,
                        default=0,
                        help="Random seed [0, 2 ** 32)")
    parser.add_argument('-w',
                        '--weight_dir',
                        type=str,
                        default='',
                        help='path to the trained weight directory')
    parser.add_argument('-i',
                        '--iteration',
                        type=int,
                        default=0,
                        help='algo iteration')
    parser.add_argument('-s',
                        '--seconds',
                        type=int,
                        default=10,
                        help='testing duration in seconds')
    args = parser.parse_args()
    weight_dir = args.weight_dir
    iteration = args.iteration

    task_path = os.path.dirname(os.path.realpath(__file__))
    rsc_path = task_path + "/../rsc"
    env_path = task_path + "/.."
    save_path = os.path.join(
        weight_dir, 'testing_' + str(iteration),
        datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    os.makedirs(save_path)

    for file in os.listdir(weight_dir):
        if file.startswith('cfg_ppo'):
            cfg_abs_path = weight_dir + '/' + file

    # config
    cfg = YAML().load(open(cfg_abs_path, 'r'))
    cfg['environment']['num_envs'] = 1

    impl = anymal_example_env(rsc_path,
                              dump(cfg['environment'], Dumper=RoundTripDumper))
    env = VecEnvPython(impl)

    obs_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    actor_net = nn.Sequential(
        rslgym_module.EmpiricalNormalization([obs_size]),
        rslgym_module.MLP([256, 128],
                          nn.Tanh,
                          obs_size,
                          action_size,
                          init_scale=1.4))

    actor = rslgym_module.Actor(
        actor_net,
        rslgym_module.MultivariateGaussianDiagonalCovariance(
            env.action_space.shape[0], 1.0), env.observation_space.shape[0],
        env.action_space.shape[0], 'cpu')

    snapshot = torch.load(weight_dir + '/snapshot' + str(iteration) + '.pt')
    actor.load_state_dict(snapshot['actor_state_dict'])

    if cfg['environment']['render']:
        env.wrapper.showWindow()

    if cfg['environment']['record_video']:
        env.start_recording_video(save_path + '/test.mp4')

    test_steps = int(args.seconds / cfg['environment']['control_dt'])

    torch.manual_seed(args.seed)

    # dummy step with a zero action to initialize the environment info before the rollout
    act = np.zeros(shape=(1, env.wrapper.getActionDim()), dtype=np.float32)
    _, _, _, new_info = env.step(act, visualize=cfg['environment']['render'])

    ob = env.reset()
    try:
        for i in range(test_steps):
            if i % 100 == 0:
                env.reset()
            act = actor.noiseless_action(ob).cpu().detach().numpy()
            ob, rew, done, info = env.step(
                act, visualize=cfg['environment']['render'])

    except KeyboardInterrupt:
        pass

    finally:
        if cfg['environment']['record_video']:
            env.stop_recording_video()