global_step.get())

            writer.add_scalar("step_time", timer.end(), global_step.get())
            writer.add_scalar("episodic_reward", reward, global_step.get())
            writer.add_scalar("episodic_sum_reward", total_reward,
                              global_step.get())
            writer.add_scalar("episode_length", local_step.get(),
                              global_step.get())

        logger.info("Sum reward: {}, episode={}".format(total_reward, episode))

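        # start DDPG updates only after the warmup period, so the replay
        # buffer holds enough transitions before training begins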
        if global_step.get() > c.ddpg_warmup_steps:
            for i in range(local_step.get()):
                timer.begin()
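                # update the actor and the target networks only on every
                # other iteration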
                ddpg.update(update_policy=i % 2 == 0, update_target=i % 2 == 0)
                ddpg.update_lr_scheduler()
                writer.add_scalar("train_step_time", timer.end(),
                                  global_step.get())

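        # if rendering was enabled, write the collected frames to a GIF
        # in a background subprocess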
        if render:
            create_gif_subproc(
                frames,
                save_env.get_trial_image_dir() +
                "/{}_{}".format(episode, global_step.get()))

        local_step.reset()
        episode_finished = False
        logger.info("End episode {} at {}".format(
            episode,
            dt.now().strftime("%m/%d-%H:%M:%S")))
                        action_dim] = agents[ag].final_step()

            actions = t.clamp(actions, min=-1, max=1)
            state, reward, episode_finished, info = env.step(
                actions[0].to("cpu"))

            frames.append(env.render(mode="rgb_array"))

            state = t.tensor(state, dtype=t.float32, device=device)
            reward = t.tensor(reward, dtype=t.float32,
                              device=device).unsqueeze(dim=0)

            total_reward += reward

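            # give each agent its own component of the reward vector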
            for agent, r in zip(agents, reward[0]):
                agent.set_reward(r.view(1, 1))

            old_samples = [agent.get_sample() for agent in agents]

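        # after the step, record each agent's history and reset its
        # negotiation state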
        for agent in agents:
            agent.update_history(local_step.get())
            agent.reset_negotiate()
        step_end = time.time()
        logger.info("Step {} completed in {:.3f} s".format(
            local_step, step_end - step_begin))

    create_gif_subproc(frames, "{}/test".format(load_dir))
    episode_end = time.time()
    logger.info("Episode completed in {:.3f} s".format(episode_end -
                                                       episode_begin))
Example #3
                                       range(first_episode, last_episode + 1)):
            tmp_observe, total_reward, local_step, frames = result
            logger.info("Sum reward: {}, episode={}".format(
                float(total_reward), episode_num))
            writer.add_scalar("episodic_sum_reward", float(total_reward),
                              episode_num)
            writer.add_scalar("episode_length", local_step, episode_num)

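            # feed the transitions collected by the worker into PPO's storage
            # before the update below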
            for obsrv in tmp_observe:
                ppo.store_transition(obsrv)

            if len(frames) != 0:
                # sub-processes cannot spawn sub-processes of their own,
                # so the frames are returned here and the GIF is created
                # from the main process
                create_gif_subproc(
                    frames,
                    save_env.get_trial_image_dir() + "/{}".format(episode_num))

            # model serialization
            if episode_num % c.model_save_int == 0:
                ppo.save(save_env.get_trial_model_dir(), version=episode_num)

        logger.info("End episode {}-{} at {}".format(
            first_episode, last_episode,
            dt.now().strftime("%m/%d-%H:%M:%S")))

        # begin training: run one PPO update over the stored transitions,
        # then step the LR scheduler
        timer.begin()
        ppo.update()
        ppo.update_lr_scheduler()
        writer.add_scalar("train_step_time", timer.end(), episode.get())
Example #4
                    format(local_step, step_end - step_begin, epoch, episode))

            logger.info("Sum reward: {}, epoch={}, episode={}".format(
                t.mean(total_reward), epoch, episode))

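            # run one DDPG training iteration for every environment step
            # taken in this episode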
            if global_step.get() > ddpg_warmup_steps:
                for i in range(local_step.get()):
                    ddpg_train_begin = time.time()
                    # if using non-batched agents, set concatenate_samples=False
                    ddpg.update(update_policy=i % 2 == 0)
                    ddpg.update_lr_scheduler()
                    ddpg_train_end = time.time()
                    logger.info(
                        "DDPG train step {} completed in {:.3f} s, epoch={}, episode={}"
                        .format(i, ddpg_train_end - ddpg_train_begin, epoch,
                                episode))

            if render:
                create_gif_subproc(
                    frames,
                    "{}/log/images/{}_{}_{}".format(root_dir, epoch, episode,
                                                    global_step.get()))

            local_step.reset()
            episode_finished = False
            episode_end = time.time()
            logger.info("Episode {} completed in {:.3f} s, epoch={}".format(
                episode, episode_end - episode_begin, epoch))

        episode.reset()
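
The snippets above all share one skeleton: roll out an episode, log per-episode scalars to TensorBoard, and start agent updates only after a warmup period. The following is a minimal, self-contained sketch of that skeleton; DummyEnv and StubAgent are hypothetical stand-ins for an environment and an agent, not part of the library used in the examples.

import random

from torch.utils.tensorboard import SummaryWriter


class DummyEnv:
    # hypothetical stand-in environment: random rewards, fixed-length episodes
    def __init__(self, length=200):
        self.length = length
        self.t = 0

    def reset(self):
        self.t = 0
        return 0.0

    def step(self, action):
        self.t += 1
        return 0.0, random.uniform(-1.0, 1.0), self.t >= self.length, {}


class StubAgent:
    # hypothetical stand-in agent exposing the act/store/update pattern above
    def act(self, state):
        return random.uniform(-1.0, 1.0)

    def store(self, transition):
        pass

    def update(self):
        pass


def train(max_episodes=10, warmup_steps=500, log_dir="./log"):
    env, agent = DummyEnv(), StubAgent()
    writer = SummaryWriter(log_dir)
    global_step = 0

    for episode in range(1, max_episodes + 1):
        state, total_reward, local_step, done = env.reset(), 0.0, 0, False

        # roll out one episode and accumulate its reward
        while not done:
            global_step += 1
            local_step += 1
            action = agent.act(state)
            state, reward, done, _ = env.step(action)
            total_reward += reward
            agent.store((state, action, reward, done))

        # per-episode logging, mirroring the writer.add_scalar calls above
        writer.add_scalar("episodic_sum_reward", total_reward, episode)
        writer.add_scalar("episode_length", local_step, episode)

        # gate training behind the warmup period, one update per step taken
        if global_step > warmup_steps:
            for _ in range(local_step):
                agent.update()

    writer.close()


if __name__ == "__main__":
    train()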