Example #1
def main():
    args, lr_args, log_dir, preprocess_wrapper = parse_args()
    easy_tf_log.set_dir(log_dir)

    utils_tensorflow.set_random_seeds(args.seed)
    sess = tf.Session()

    envs = make_envs(args.env_id, preprocess_wrapper, args.max_n_noops,
                     args.n_workers, args.seed, args.debug, log_dir)

    step_counter = utils.TensorFlowCounter(sess)
    update_counter = utils.TensorFlowCounter(sess)
    lr = make_lr(lr_args, step_counter.value)
    optimizer = make_optimizer(lr)

    networks = make_networks(n_workers=args.n_workers,
                             obs_shape=envs[0].observation_space.shape,
                             n_actions=envs[0].action_space.n,
                             value_loss_coef=args.value_loss_coef,
                             entropy_bonus=args.entropy_bonus,
                             max_grad_norm=args.max_grad_norm,
                             optimizer=optimizer,
                             detailed_logs=args.detailed_logs,
                             debug=args.debug)

    global_vars = tf.trainable_variables('global')
    # Why save_relative_paths=True?
    # So that the plain-text 'checkpoint' file is written with relative paths,
    # letting us restore from checkpoints created on another machine.
    saver = tf.train.Saver(global_vars,
                           max_to_keep=1,
                           save_relative_paths=True)
    if args.load_ckpt:
        print("Restoring from checkpoint '{}'...".format(args.load_ckpt),
              end='',
              flush=True)
        saver.restore(sess, args.load_ckpt)
        print("done!")
    else:
        sess.run(tf.global_variables_initializer())

    workers = make_workers(sess, envs, networks, args.n_workers, log_dir)

    worker_threads = start_worker_threads(workers, args.n_steps,
                                          args.steps_per_update, step_counter,
                                          update_counter)

    run_manager(worker_threads, sess, lr, step_counter, update_counter,
                log_dir, saver, args.manager_wake_interval_seconds,
                args.ckpt_interval_seconds)

    for env in envs:
        env.close()
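
The two counters above are thin wrappers around an in-graph integer variable, so the learning-rate schedule can read the step count as a tensor. A minimal sketch of what a helper like utils.TensorFlowCounter might look like (hypothetical implementation, TF1-style; the real helper lives in utils):

import tensorflow as tf

class TensorFlowCounter:
    """Hypothetical sketch: an integer counter stored in the TF graph."""

    def __init__(self, sess):
        self.sess = sess
        self.value = tf.Variable(0, trainable=False, dtype=tf.int64)
        self.increment_by = tf.placeholder(tf.int64, shape=[])
        self.increment_op = self.value.assign_add(self.increment_by)

    def increment(self, n=1):
        # Run the assign-add op in the shared session.
        self.sess.run(self.increment_op, feed_dict={self.increment_by: n})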
Example #2
def main():
    args, lr_args, log_dir, preprocess_wrapper = parse_args() # parse_args() is imported from params
    easy_tf_log.set_dir(log_dir) # set the directory that easy_tf_log writes logs to

    utils_tensorflow.set_random_seeds(args.seed) # seed the random number generators
    sess = tf.Session() # A class for running TensorFlow operations. A Session object encapsulates the environment in which Operation objects are executed and Tensor objects are evaluated.

    envs = make_envs(args.env_id, preprocess_wrapper, args.max_n_noops, args.n_workers,
                     args.seed, args.debug, log_dir)

    step_counter = utils.TensorFlowCounter(sess)
    update_counter = utils.TensorFlowCounter(sess)
    lr = make_lr(lr_args, step_counter.value)
    optimizer = make_optimizer(lr)

    # Create the per-worker set of networks
    networks = make_networks(n_workers=args.n_workers, obs_shape=envs[0].observation_space.shape,
                             n_actions=envs[0].action_space.n, value_loss_coef=args.value_loss_coef,
                             entropy_bonus=args.entropy_bonus, max_grad_norm=args.max_grad_norm,
                             optimizer=optimizer, detailed_logs=args.detailed_logs,
                             debug=args.debug)

    # Returns all variables created with trainable=True.
    # scope: (Optional.) A string. If supplied, the resulting list is filtered to include only
    # items whose name attribute matches the scope using re.match.
    global_vars = tf.trainable_variables('global')


    # Why save_relative_paths=True?
    # So that the plain-text 'checkpoint' file uses relative paths,
    # which lets us restore from checkpoints created on another machine.
    saver = tf.train.Saver(global_vars, max_to_keep=1, save_relative_paths=True)

    # If there is a checkpoint to load, restore it and resume from where training stopped; otherwise start from scratch
    if args.load_ckpt:
        print("Restoring from checkpoint '{}'...".format(args.load_ckpt), end='', flush=True)
        saver.restore(sess, args.load_ckpt) # restore (load) the session variables from the given checkpoint
        print("done!")
    else:
        sess.run(tf.global_variables_initializer())

    # Create the workers
    workers = make_workers(sess, envs, networks, args.n_workers, log_dir)

    # start one thread per worker
    worker_threads = start_worker_threads(workers, args.n_steps, args.steps_per_update,
                                          step_counter, update_counter)

    # Manager loop that supervises the worker threads
    run_manager(worker_threads, sess, lr, step_counter, update_counter, log_dir, saver,
                args.manager_wake_interval_seconds, args.ckpt_interval_seconds)

    for env in envs:
        env.close()
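
make_lr turns lr_args and the in-graph step counter into a learning-rate tensor. The exact schedule is defined elsewhere; as an illustration only, a linear decay driven by the step counter could be built like this (make_lr_linear and the constants are hypothetical, not the project's defaults):

import tensorflow as tf

def make_lr_linear(initial_lr, total_steps, step_tensor):
    # Decay the learning rate linearly from initial_lr to 0 over total_steps,
    # using the in-graph step counter as the schedule's global step.
    return tf.train.polynomial_decay(learning_rate=initial_lr,
                                     global_step=step_tensor,
                                     decay_steps=total_steps,
                                     end_learning_rate=0.0,
                                     power=1.0)

# Example usage (placeholder values):
# lr = make_lr_linear(7e-4, int(80e6), step_counter.value)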
Example #3
def train(args):
    # Verify algorithm and config
    global env_options, trainer_options
    algo = args.algo
    if algo == "PPO":
        config = ppo_config
    else:
        raise ValueError("args.algo must be in [PPO]")
    config.num_envs = args.num_envs
    if args.envopt is not None:
        with open(args.envopt) as f:
            env_options = json.load(f)
    if args.trainopt is not None:
        with open(args.trainopt) as f:
            trainer_options = json.load(f)
    if args.opt is not None:
        with open(args.opt) as f:
            opt = json.load(f)
        env_options = opt['env']
        trainer_options = opt['trainer']

    # Seed the environments and setup torch
    seed = args.seed
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    torch.set_num_threads(1)

    # Clean log directory
    log_dir = verify_log_dir('work_dirs', args.log_dir)

    # Create vectorized environments
    num_envs = args.num_envs
    env_id = args.env_id
    envs = make_envs(
        env_id=env_id,
        seed=seed,
        log_dir=log_dir,
        num_envs=num_envs,
        asynchronous=True,
        options=env_options,
    )

    if env_id == "Walker2d-v3":
        healthy_z_range = (0.8, 2.0)
    elif env_id == 'Humanoid-v3':
        healthy_z_range = (1.0, 2.0)
    if 'healthy_z_range' in env_options:
        healthy_z_range = env_options['healthy_z_range']
    eval_env = gym.make(env_id,
                        healthy_z_range=healthy_z_range,
                        healthy_reward=0)
    if env_id == "Walker2d-v3":
        eval_env = Walker2d_wrapper(eval_env, env_options)

    obs_dim = envs.observation_space.shape[0]
    act_dim = envs.action_space.shape[0]
    real_obs_dim = obs_dim
    real_act_dim = act_dim
    if 'real_obs_dim' in trainer_options:
        real_obs_dim = trainer_options['real_obs_dim']
    if 'real_act_dim' in trainer_options:
        real_act_dim = trainer_options['real_act_dim']
    dim_dict = dict(obs_dim=obs_dim,
                    act_dim=act_dim,
                    real_obs_dim=real_obs_dim,
                    real_act_dim=real_act_dim)

    # Setup trainer
    if algo == "PPO":
        trainer = PPOTrainer(envs, config, trainer_options)
    else:
        raise NotImplementedError

    # Create a placeholder tensor to help stack frames in the 2nd dimension,
    # i.e. turn the observation from shape [num_envs, 1, 84, 84] to
    # [num_envs, 4, 84, 84].
    frame_stack_tensor = FrameStackTensor(num_envs,
                                          envs.observation_space.shape,
                                          config.device)

    # Setup some stats helpers
    episode_rewards = np.zeros([num_envs, 1], dtype=float)
    total_episodes = total_steps = iteration = 0
    reward_recorder = deque(maxlen=100)
    episode_length_recorder = deque(maxlen=100)
    sample_timer = Timer()
    process_timer = Timer()
    update_timer = Timer()
    total_timer = Timer()
    progress = []
    evaluate_stat = {}

    # Start training
    print("Start training!")
    obs = envs.reset()
    frame_stack_tensor.update(obs)
    trainer.rollouts.observations[0].copy_(
        reduce_shape(frame_stack_tensor.get(), real_obs_dim))
    while True:  # Break when total_steps exceeds maximum value
        with sample_timer:
            for index in range(config.num_steps):

                trainer.model.eval()
                values, actions, action_log_prob = trainer.model.step(
                    reduce_shape(frame_stack_tensor.get(), real_obs_dim))

                cpu_actions = actions.cpu().numpy()
                cpu_actions = enlarge_shape(cpu_actions, act_dim)

                obs, reward, done, info, masks, total_episodes, \
                    total_steps, episode_rewards = step_envs(
                        cpu_actions, envs, episode_rewards, frame_stack_tensor,
                        reward_recorder, episode_length_recorder, total_steps,
                        total_episodes, config.device)

                rewards = torch.from_numpy(reward.astype(np.float32)).view(
                    -1, 1).to(config.device)

                # Store samples
                trainer.rollouts.insert(
                    reduce_shape(frame_stack_tensor.get(), real_obs_dim),
                    actions, action_log_prob, values, rewards, masks)

        # ===== Process Samples =====
        with process_timer:
            with torch.no_grad():
                next_value = trainer.compute_values(
                    trainer.rollouts.observations[-1])
            trainer.rollouts.compute_returns(next_value, config.GAMMA)

        trainer.model.train()
        # ===== Update Policy =====
        with update_timer:
            policy_loss, value_loss, total_loss = trainer.update(
                trainer.rollouts)
            trainer.rollouts.after_update()

        # ===== Evaluate Current Policy =====
        if iteration % config.eval_freq == 0:
            eval_timer = Timer()
            rewards, eplens = evaluate(trainer, eval_env, 1, dim_dict=dim_dict)
            evaluate_stat = summary(rewards, "episode_reward")
            evaluate_stat.update(summary(eplens, "episode_length"))
            evaluate_stat.update(
                dict(evaluate_time=eval_timer.now,
                     evaluate_iteration=iteration))

        # ===== Log information =====
        if iteration % config.log_freq == 0:
            stats = dict(
                log_dir=log_dir,
                frame_per_second=int(total_steps / total_timer.now),
                training_episode_reward=summary(reward_recorder,
                                                "episode_reward"),
                training_episode_length=summary(episode_length_recorder,
                                                "episode_length"),
                evaluate_stats=evaluate_stat,
                learning_stats=dict(policy_loss=policy_loss,
                                    value_loss=value_loss,
                                    total_loss=total_loss),
                total_steps=total_steps,
                total_episodes=total_episodes,
                time_stats=dict(sample_time=sample_timer.avg,
                                process_time=process_timer.avg,
                                update_time=update_timer.avg,
                                total_time=total_timer.now,
                                episode_time=sample_timer.avg +
                                process_timer.avg + update_timer.avg),
                iteration=iteration)

            progress.append(stats)
            pretty_print({
                "===== {} Training Iteration {} =====".format(algo, iteration):
                stats
            })

        if iteration % config.save_freq == 0:
            trainer_path = trainer.save_w(log_dir, "iter{}".format(iteration))
            progress_path = save_progress(log_dir, progress)
            print(
                "Saved trainer state at <{}>. Saved progress at <{}>.".format(
                    trainer_path, progress_path))

        # Stop training when total_steps is greater than args.max_steps
        if total_steps > args.max_steps:
            break

        iteration += 1

    trainer.save_w(log_dir, "final")
    envs.close()
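
reduce_shape and enlarge_shape are used above to move between the full observation/action spaces and the reduced dimensions given in trainer_options, but their implementations are not shown. A plausible minimal sketch, assuming they simply truncate and zero-pad the last axis respectively (names reused here for illustration only):

import numpy as np

def reduce_shape(tensor, real_dim):
    # Keep only the first real_dim features along the last axis
    # (works for both torch tensors and numpy arrays).
    return tensor[..., :real_dim]

def enlarge_shape(actions, full_dim):
    # Zero-pad per-environment action vectors back to the full action
    # dimension expected by the vectorized environment.
    padded = np.zeros((actions.shape[0], full_dim), dtype=actions.dtype)
    padded[:, :actions.shape[1]] = actions
    return padded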
Example #4
def train(args):
    # Verify algorithm and config
    global env_options, trainer_options
    algo = args.algo
    if algo == "PPO":
        config = ppo_config
    else:
        raise ValueError("args.algo must be in [PPO]")
    config.num_envs = args.num_envs
    config.activation = nn.ReLU
    if args.trainopt is not None:
        with open(args.trainopt) as f:
            trainer_options = json.load(f)
    if args.opt is not None:
        with open(args.opt) as f:
            opt = json.load(f)
        env_options = opt['env']
        trainer_options = opt['trainer']

    # Seed the environments and setup torch
    seed = args.seed
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    torch.set_num_threads(1)

    # Clean log directory
    log_dir = verify_log_dir('work_dirs', args.log_dir)

    # Create vectorized environments
    num_envs = args.num_envs
    env_id = args.env_id

    main_envs = make_envs(
        env_id='Humanoid-v3',
        seed=seed,
        log_dir=log_dir,
        num_envs=num_envs,
        asynchronous=True,
    )

    aux_envs = make_envs(
        env_id='Walker2d-v3',
        seed=seed,
        log_dir=log_dir,
        num_envs=num_envs,
        asynchronous=True,
    )

    envs = [main_envs, aux_envs]

    # eval_env is main_env
    healthy_z_range = (1.0, 2.0)
    eval_env = gym.make(env_id,
                        healthy_z_range=healthy_z_range,
                        healthy_reward=0)

    main_obs_dim = 376
    main_act_dim = 17
    main_reduce_obs_dim = 46
    main_reduce_act_dim = 11
    aux_obs_dim = 17
    aux_act_dim = 6

    obs_dims = [main_reduce_obs_dim, aux_obs_dim]
    act_dims = [main_act_dim, aux_act_dim]

    dim_dict = dict(obs_a=main_reduce_obs_dim,
                    act_a=main_reduce_act_dim,
                    obs_b=aux_obs_dim,
                    act_b=aux_act_dim,
                    coeff_a=0.4,
                    coeff_b=1)
    dim_dict['act_dim'] = 17
    dim_dict['real_obs_dim'] = 46

    # Setup trainer
    if algo == "PPO":
        trainer = PPOTrainerMTMT(config, dim_dict)
    else:
        raise NotImplementedError

    frame_stack_tensors = [
        FrameStackTensor(num_envs, main_envs.observation_space.shape,
                         config.device),
        FrameStackTensor(num_envs, aux_envs.observation_space.shape,
                         config.device)
    ]

    # Setup some stats helpers
    episode_rewards = [
        np.zeros([num_envs, 1], dtype=float),
        np.zeros([num_envs, 1], dtype=float)
    ]

    total_episodes = total_steps = iteration = 0

    reward_recorders = [deque(maxlen=100), deque(maxlen=100)]
    episode_length_recorders = [deque(maxlen=100), deque(maxlen=100)]

    sample_timer = Timer()
    process_timer = Timer()
    update_timer = Timer()
    total_timer = Timer()
    progress = []
    evaluate_stat = {}

    # Start training
    print("Start training!")
    obs = [envs[i].reset() for i in range(2)]
    _ = [frame_stack_tensors[i].update(obs[i]) for i in range(2)]

    # first update
    for i in range(2):
        trainer.rollouts[i].observations[0].copy_(
            reduce_shape(frame_stack_tensors[i].get(), obs_dims[i]))

    branch_names = ['a', 'b']

    while True:  # Break when total_steps exceeds maximum value
        with sample_timer:
            # prepare rollout a
            for ind in range(2):
                for index in range(config.num_steps):
                    trainer.model.eval()
                    values, actions, action_log_prob = trainer.model.step(
                        reduce_shape(frame_stack_tensors[ind].get(),
                                     obs_dims[ind]),
                        deterministic=False,
                        branch=branch_names[ind])
                    cpu_actions = actions.cpu().numpy()
                    cpu_actions = enlarge_shape(cpu_actions, act_dims[ind])

                    # obs, done, info not needed, we have masks & obs in frame_stack_tensors
                    _, reward, _, _, masks, new_total_episodes, new_total_steps, episode_rewards[ind] = \
                        step_envs(cpu_actions, envs[ind], episode_rewards[ind], frame_stack_tensors[ind],
                                  reward_recorders[ind], episode_length_recorders[ind],
                                  total_steps, total_episodes, config.device)

                    if ind == 0:
                        total_episodes = new_total_episodes
                        total_steps = new_total_steps

                    rewards = torch.from_numpy(reward.astype(np.float32)).view(
                        -1, 1).to(config.device)

                    trainer.rollouts[ind].insert(
                        reduce_shape(frame_stack_tensors[ind].get(),
                                     obs_dims[ind]), actions, action_log_prob,
                        values, rewards, masks)

        # ===== Process Samples =====
        with process_timer:
            with torch.no_grad():
                for i in range(2):
                    next_value = trainer.compute_values(
                        trainer.rollouts[i].observations[-1], branch_names[i])
                    trainer.rollouts[i].compute_returns(
                        next_value, config.GAMMA)

        trainer.model.train()
        # ===== Update Policy =====
        with update_timer:
            losses = trainer.update(trainer.rollouts[0], trainer.rollouts[1])
            policy_loss, value_loss, total_loss = list(zip(*losses))
            trainer.rollouts[0].after_update()
            trainer.rollouts[1].after_update()

        # ===== Evaluate Current Policy =====
        if iteration % config.eval_freq == 0:
            eval_timer = Timer()
            # seems ok, by default model is dealing with task1
            rewards, eplens = evaluate(trainer, eval_env, 1, dim_dict=dim_dict)
            evaluate_stat = summary(rewards, "episode_reward")
            evaluate_stat.update(summary(eplens, "episode_length"))
            evaluate_stat.update(
                dict(evaluate_time=eval_timer.now,
                     evaluate_iteration=iteration))

        # ===== Log information =====
        if iteration % config.log_freq == 0:
            stats = dict(
                log_dir=log_dir,
                frame_per_second=int(total_steps / total_timer.now),
                training_episode_reward_a=summary(reward_recorders[0],
                                                  "episode_reward"),
                training_episode_length_a=summary(episode_length_recorders[0],
                                                  "episode_length"),
                training_episode_reward_b=summary(reward_recorders[1],
                                                  "episode_reward"),
                training_episode_length_b=summary(episode_length_recorders[1],
                                                  "episode_length"),
                evaluate_stats=evaluate_stat,
                learning_stats_a=dict(policy_loss=policy_loss[0],
                                      value_loss=value_loss[0],
                                      total_loss=total_loss[0]),
                learning_stats_b=dict(policy_loss=policy_loss[1],
                                      value_loss=value_loss[1],
                                      total_loss=total_loss[1]),
                total_steps=total_steps,
                total_episodes=total_episodes,
                time_stats=dict(sample_time=sample_timer.avg,
                                process_time=process_timer.avg,
                                update_time=update_timer.avg,
                                total_time=total_timer.now,
                                episode_time=sample_timer.avg +
                                process_timer.avg + update_timer.avg),
                iteration=iteration)

            progress.append(stats)
            pretty_print({
                "===== {} Training Iteration {} =====".format(algo, iteration):
                stats
            })

        if iteration % config.save_freq == 0:
            trainer_path = trainer.save_w(log_dir, "iter{}".format(iteration))
            progress_path = save_progress(log_dir, progress)
            print(
                "Saved trainer state at <{}>. Saved progress at <{}>.".format(
                    trainer_path, progress_path))

        # Stop training when total_steps is greater than args.max_steps
        if total_steps > args.max_steps:
            break

        iteration += 1

    trainer.save_w(log_dir, "final")
    for env in envs:
        env.close()
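
Both training loops bootstrap a value for the last observation and then call compute_returns on the rollout storage. The storage class is not shown; below is a minimal sketch of the standard masked discounted-return recursion it typically implements (illustrative only, the real implementation may use GAE instead):

import torch

def compute_returns(rewards, masks, next_value, gamma):
    # rewards, masks: [num_steps, num_envs, 1]; masks[t] is 0 where the
    # episode ended at step t, which cuts the bootstrap at that point.
    returns = torch.zeros_like(rewards)
    running = next_value
    for t in reversed(range(rewards.size(0))):
        running = rewards[t] + gamma * running * masks[t]
        returns[t] = running
    return returns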
Example #5
def train(cfg):
    print('Start to train!\n')
    envs = make_envs(num_envs=16, env_name="CartPole-v0")
    state_dim = envs.observation_space.shape[0]
    action_dim = envs.action_space.n
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    agent = A2C(state_dim, action_dim, hidden_dim=256)
    # moving_average_rewards = []
    # ep_steps = []
    log_dir = os.path.split(
        os.path.abspath(__file__))[0] + "/logs/train/" + SEQUENCE
    writer = SummaryWriter(log_dir)
    state = envs.reset()
    for i_episode in range(1, cfg.train_eps + 1):
        log_probs = []
        values = []
        rewards = []
        masks = []
        entropy = 0
        for i_step in range(1, cfg.train_steps + 1):
            state = torch.FloatTensor(state).to(device)
            dist, value = agent.model(state)
            action = dist.sample()
            next_state, reward, done, _ = envs.step(action.cpu().numpy())
            state = next_state
            log_prob = dist.log_prob(action)
            entropy += dist.entropy().mean()
            log_probs.append(log_prob)
            values.append(value)
            rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
            masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))
        if i_episode % 20 == 0:
            print("reward", test_env(agent, device='cpu'))
        next_state = torch.FloatTensor(next_state).to(device)
        _, next_value = agent.model(next_state)
        returns = agent.compute_returns(next_value, rewards, masks)

        log_probs = torch.cat(log_probs)
        returns = torch.cat(returns).detach()
        values = torch.cat(values)
        advantage = returns - values
        actor_loss = -(log_probs * advantage.detach()).mean()
        critic_loss = advantage.pow(2).mean()
        loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy

        agent.optimizer.zero_grad()
        loss.backward()
        agent.optimizer.step()
    for _ in range(100):
        print("test_reward", test_env(agent, device='cpu'))

        # print('Episode:', i_episode, ' Reward: %i' %
        #       int(ep_reward[0]), 'n_steps:', i_step)
        # ep_steps.append(i_step)
        # rewards.append(ep_reward)
        # if i_episode == 1:
        #     moving_average_rewards.append(ep_reward[0])
        # else:
        #     moving_average_rewards.append(
        #         0.9*moving_average_rewards[-1]+0.1*ep_reward[0])
        # writer.add_scalars('rewards',{'raw':rewards[-1], 'moving_average': moving_average_rewards[-1]}, i_episode)
        # writer.add_scalar('steps_of_each_episode',
        #                   ep_steps[-1], i_episode)
    writer.close()
    print('Complete training!')
    ''' Save the model '''
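
The function ends with a "Save the model" docstring but no actual save call. A minimal sketch, assuming the agent's network is exposed as agent.model (attribute and file names are hypothetical):

import os
import torch

def save_model(agent, log_dir, tag='final'):
    # Persist the actor-critic network weights next to the training logs.
    path = os.path.join(log_dir, 'a2c_{}.pth'.format(tag))
    torch.save(agent.model.state_dict(), path)
    return path

# e.g. save_model(agent, log_dir)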