# Example 1
# 0
def testing_ddpg(args=get_args()):
    """Evaluate a saved DDPG policy on ``EnvThreeUsers`` and print metrics.

    The actor and critic networks are rebuilt from ``args``, the weights are
    restored from ``<args.logdir>/<args.task>/ddpg/policy.pth`` and the policy
    is rolled out for 10000 episodes on a fresh environment. The per-user
    statistics returned by the collector are divided by the episode count and
    printed, followed by the environment's ``large_than_Q_*`` attributes.

    :param args: parsed options. This function reads ``step_per_epoch``,
        ``layer_num``, ``unit_num``, ``device``, ``actor_lr``, ``critic_lr``,
        ``tau``, ``gamma``, ``exploration_noise``, ``logdir``, ``task`` and
        ``render``; it overwrites ``state_shape``, ``action_shape`` and
        ``max_action``. NOTE(review): the default value is produced by a
        single ``get_args()`` call made when this function is defined.
    """
    # This first environment is only inspected for its spaces and action
    # bounds; the rollout below runs on a second, fresh instance.
    spec_env = EnvThreeUsers(args.step_per_epoch)
    args.state_shape = spec_env.observation_space.shape
    args.action_shape = spec_env.action_space.shape
    args.max_action = spec_env.action_space.high[0]

    # model
    actor_net = Net(args.layer_num,
                    args.state_shape,
                    0,
                    device=args.device,
                    hidden_layer_size=args.unit_num)
    actor = Actor(actor_net,
                  args.action_shape,
                  args.max_action,
                  args.device,
                  hidden_layer_size=args.unit_num).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic_net = Net(args.layer_num,
                     args.state_shape,
                     args.action_shape,
                     concat=True,
                     device=args.device,
                     hidden_layer_size=args.unit_num)
    critic = Critic(critic_net, args.device, args.unit_num).to(args.device)
    critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr)
    policy = DDPGPolicy(
        actor,
        actor_optim,
        critic,
        critic_optim,
        args.tau,
        args.gamma,
        OUNoise(sigma=args.exploration_noise),
        [spec_env.action_space.low[0], spec_env.action_space.high[0]],
        reward_normalization=True,
        ignore_done=True)

    # restore model
    log_path = os.path.join(args.logdir, args.task, 'ddpg')
    checkpoint_path = os.path.join(log_path, 'policy.pth')
    # map_location lets a checkpoint saved on one device (e.g. a GPU) be
    # restored on the device selected by ``args.device``.
    policy.load_state_dict(
        torch.load(checkpoint_path, map_location=args.device))
    print('\nrelode model!')

    env = EnvThreeUsers(args.step_per_epoch)
    collector = Collector(policy, env)
    n_episodes = 10000
    try:
        result = collector.collect(n_episode=n_episodes, render=args.render)
        # NOTE: the leading whitespace on the continuation lines below is
        # inside the string literal and therefore part of the printed text.
        print('''\nty1_succ_1: {:.6f}, q_len_1: {:.6f},
        \nty1_succ_2: {:.2f}, q_len_2: {:.2f},
        \nty1_succ_3: {:.2f}, q_len_3: {:.2f},
        \nee_1: {:.2f}, ee_2: {:.2f}, ee_3: {:.2f},
        \navg_rate:{:.2f}, \navg_power:{:.2f}\n'''.format(
            result["ty1s_1"][0] / n_episodes, result["ql_1"][0] / n_episodes,
            result["ty1s_2"][0] / n_episodes, result["ql_2"][0] / n_episodes,
            result["ty1s_3"][0] / n_episodes, result["ql_3"][0] / n_episodes,
            result["ee_1"][0] / n_episodes,
            result["ee_2"][0] / n_episodes,
            result["ee_3"][0] / n_episodes,
            result["avg_r"] / n_episodes, result["avg_p"] / n_episodes))
        print('large than Qmax: users1: {}, users2: {}, users3: {}.'.format(
            str(env.large_than_Q_1), str(env.large_than_Q_2),
            str(env.large_than_Q_3)))
    finally:
        # Close the collector even if the long rollout fails or is
        # interrupted.
        collector.close()
# Example 2
# 0
def _export_training_results(training_res, step_per_epoch):
    """Write the collected evaluation metrics to an Excel workbook.

    :param training_res: list with one ``[energy efficiency, type-1 success
        rate, queue length, return]`` entry per evaluation; may be empty.
    :param int step_per_epoch: only used to build the output file name,
        ``directly_training_slot<step_per_epoch>.xlsx`` (relative path).
    """
    table = np.array(training_res)
    wb = Workbook()
    ws = wb.active
    ws.title = 'training result'
    ws['A1'] = 'testing num'
    ws['B1'] = 'energy efficiency'
    ws['C1'] = 'type 1 success rate'
    ws['D1'] = 'type 2 q length'
    ws['E1'] = 'return'
    # Row 1 holds the headers, so evaluation ``idx`` lands on row ``idx + 2``.
    # An empty ``table`` has shape (0,), so nothing is written below.
    for idx in range(table.shape[0]):
        ws.cell(idx + 2, 1).value = idx + 1
        for col in range(4):
            ws.cell(idx + 2, col + 2).value = table[idx, col]
    wb.save("directly_training_slot" + str(step_per_epoch) + ".xlsx")


def offpolicy_trainer(
    policy: BasePolicy,
    train_collector: Collector,
    test_collector: Collector,
    max_epoch: int,
    step_per_epoch: int,
    collect_per_step: int,
    episode_per_test: Union[int, List[int]],
    batch_size: int,
    update_per_step: int = 1,
    train_fn: Optional[Callable[[int], None]] = None,
    test_fn: Optional[Callable[[int], None]] = None,
    stop_fn: Optional[Callable[[float], bool]] = None,
    save_fn: Optional[Callable[[BasePolicy], None]] = None,
    log_fn: Optional[Callable[[dict], None]] = None,
    writer: Optional[SummaryWriter] = None,
    log_interval: int = 1,
    verbose: bool = True,
    test_in_train: bool = False,
) -> Dict[str, Union[float, str]]:
    """A customised wrapper for the off-policy trainer procedure.

    Compared with the stock trainer interface this variant does *not* use
    ``test_collector`` for evaluation. Every 50 epochs it builds a fresh
    ``EnvFourUsers(step_per_epoch)`` environment, rolls the policy out for 100
    episodes and records the averaged metrics. After training, the recorded
    metrics are written to ``directly_training_slot<step_per_epoch>.xlsx``.
    Early stopping inside the training loop and TensorBoard logging are
    disabled.

    :param policy: an instance of the :class:`~tianshou.policy.BasePolicy`
        class.
    :param train_collector: the collector used for training.
    :type train_collector: :class:`~tianshou.data.Collector`
    :param test_collector: not used for evaluation here; its
        ``collect_time`` is overwritten with ``-1`` before it is handed to
        ``gather_info``.
    :type test_collector: :class:`~tianshou.data.Collector`
    :param int max_epoch: the maximum of epochs for training. The training
        process might be finished before reaching the ``max_epoch``.
    :param int step_per_epoch: the number of step for updating policy network
        in one epoch. Also passed to ``EnvFourUsers`` and used in the name of
        the exported workbook.
    :param int collect_per_step: the number of frames the collector would
        collect before the network update.
    :param episode_per_test: accepted for interface compatibility; unused
        (evaluation always runs 100 episodes).
    :param int batch_size: the batch size of sample data, which is going to
        feed in the policy network.
    :param int update_per_step: the number of times the policy network would
        be updated after frames be collected.
    :param function train_fn: a function receives the current number of epoch
        index and performs some operations at the beginning of training in this
        epoch.
    :param function test_fn: accepted for interface compatibility; unused.
    :param function save_fn: a function for saving policy when the reward of
        an evaluation is better than the best one seen so far.
    :param function stop_fn: called after every epoch with the best
        evaluation reward so far (``-1`` before the first evaluation);
        training stops when it returns a true value.
    :param function log_fn: a function receives env info for logging; it is
        forwarded to ``train_collector.collect``.
    :param torch.utils.tensorboard.SummaryWriter writer: accepted for
        interface compatibility; unused.
    :param int log_interval: accepted for interface compatibility; unused.
    :param bool verbose: whether to print the evaluation summary.
    :param bool test_in_train: accepted for interface compatibility; unused.

    :return: See :func:`~tianshou.trainer.gather_info`.
    """
    eval_interval = 50  # evaluate once every this many epochs
    eval_episodes = 100  # episodes rolled out per evaluation
    best_epoch, best_reward = -1, -1
    start_time = time.time()
    training_res = []
    for epoch in range(1, 1 + max_epoch):
        # train
        policy.train()
        if train_fn:
            train_fn(epoch)
        with tqdm.tqdm(total=step_per_epoch,
                       desc=f'Epoch #{epoch}',
                       **tqdm_config) as t:
            while t.n < t.total:
                result = train_collector.collect(n_step=collect_per_step,
                                                 log_fn=log_fn)
                # Never run more updates than the progress bar has left.
                n_updates = update_per_step * min(
                    result['n/st'] // collect_per_step, t.total - t.n)
                for _ in range(n_updates):
                    policy.learn(train_collector.sample(batch_size))
                    t.update(1)
            if t.n <= t.total:
                t.update()
        # test
        if epoch % eval_interval == 0:
            env = EnvFourUsers(step_per_epoch)
            policy.train(False)
            collector = Collector(policy, env)
            try:
                result = collector.collect(n_episode=eval_episodes)
            finally:
                # A new collector is built for every evaluation, so release
                # it (and its environment) as soon as the rollout is over.
                collector.close()
            if best_epoch == -1 or best_reward < result['rew']:
                best_reward = result['rew']
                best_epoch = epoch
                if save_fn:
                    save_fn(policy)
            # Per-user averages over the evaluation episodes (users 1..4).
            succ = [result[f'ty1s_{u}'][0] / eval_episodes
                    for u in range(1, 5)]
            qlen = [result[f'ql_{u}'][0] / eval_episodes
                    for u in range(1, 5)]
            effi = [result[f'ee_{u}'][0] / eval_episodes
                    for u in range(1, 5)]
            if verbose:
                # ``print`` joins its arguments with a single space; the
                # last two f-strings are deliberately left adjacent (no
                # comma) so the emitted text stays unchanged.
                print(
                    f'Epoch #{epoch}: test_reward: {result["rew"]:.6f}, ',
                    f'best_reward: {best_reward:.6f} in #{best_epoch},\n',
                    f'ty1_succ_rate_1: {succ[0]:.4f}, ',
                    f'ty1_succ_rate_2: {succ[1]:.4f},  \n',
                    f'ty1_succ_rate_3: {succ[2]:.4f}, ',
                    f'ty1_succ_rate_4: {succ[3]:.4f}, \n',
                    f'Q_len_1: {qlen[0]:.4f},',
                    f'Q_len_2: {qlen[1]:.4f}, \n',
                    f'Q_len_3: {qlen[2]:.4f},',
                    f'Q_len_4: {qlen[3]:.4f}, \n',
                    f'energy_effi_1: {effi[0]:.4f},',
                    f'energy_effi_2: {effi[1]:.4f},\n',
                    f'energy_effi_3: {effi[2]:.4f},',
                    f'energy_effi_4: {effi[3]:.4f}\n',
                    f'avg_rate: {result["avg_r"]/eval_episodes:.4f}, '
                    f'avg_power: {result["avg_p"]/eval_episodes:.4f} dBm\n')
            # Explicit left-to-right additions keep the stored means
            # numerically identical to the per-user values printed above.
            training_res.append([
                (effi[0] + effi[1] + effi[2] + effi[3]) / 4,
                (succ[0] + succ[1] + succ[2] + succ[3]) / 4,
                (qlen[0] + qlen[1] + qlen[2] + qlen[3]) / 4,
                result["rew"]
            ])
        if stop_fn and stop_fn(best_reward):
            break
    _export_training_results(training_res, step_per_epoch)
    # NOTE(review): test_collector never collects in this variant;
    # presumably -1 keeps gather_info from dividing by a zero collect time
    # -- confirm against gather_info.
    test_collector.collect_time = -1
    return gather_info(start_time, train_collector, test_collector,
                       best_reward)
# Example 3
# 0
def training_ddpg(args=get_args()):
    """Train a DDPG policy on ``EnvTwoUsers`` with the off-policy trainer.

    Builds the vectorised train/test environments, seeds NumPy, PyTorch and
    the environments, constructs orthogonally initialised actor/critic
    networks and runs ``offpolicy_trainer``. Whenever the trainer calls
    ``save_fn`` the policy weights are written to
    ``<args.logdir>/<args.task>/ddpg/policy.pth``. Both collectors are closed
    when training ends, including when it ends with an exception.

    :param args: parsed options. This function reads ``step_per_epoch``,
        ``training_num``, ``test_num``, ``seed``, ``layer_num``, ``unit_num``,
        ``device``, ``actor_lr``, ``critic_lr``, ``tau``, ``gamma``,
        ``exploration_noise``, ``buffer_size``, ``logdir``, ``task``,
        ``epoch``, ``collect_per_step`` and ``batch_size``; it overwrites
        ``state_shape``, ``action_shape`` and ``max_action``.
        NOTE(review): the default value is produced by a single
        ``get_args()`` call made when this function is defined.
    """
    # A single environment instance is only used to query the spaces and
    # the action bounds.
    env = EnvTwoUsers(args.step_per_epoch)
    args.state_shape = env.observation_space.shape
    args.action_shape = env.action_space.shape
    args.max_action = env.action_space.high[0]
    train_envs = VectorEnv([
        lambda: EnvTwoUsers(args.step_per_epoch)
        for _ in range(args.training_num)
    ])
    test_envs = VectorEnv([
        lambda: EnvTwoUsers(args.step_per_epoch)
        for _ in range(args.test_num)
    ])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    actor_net = Net(args.layer_num,
                    args.state_shape,
                    device=args.device,
                    hidden_layer_size=args.unit_num)
    actor = Actor(actor_net,
                  args.action_shape,
                  args.max_action,
                  args.device,
                  hidden_layer_size=args.unit_num).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic_net = Net(args.layer_num,
                     args.state_shape,
                     args.action_shape,
                     concat=True,
                     device=args.device,
                     hidden_layer_size=args.unit_num)
    critic = Critic(critic_net, args.device,
                    hidden_layer_size=args.unit_num).to(args.device)
    critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr)
    # orthogonal initialization of every linear layer (weights) with zeroed
    # biases, for both actor and critic
    for module in list(actor.modules()) + list(critic.modules()):
        if isinstance(module, torch.nn.Linear):
            torch.nn.init.orthogonal_(module.weight)
            torch.nn.init.zeros_(module.bias)
    policy = DDPGPolicy(
        actor,
        actor_optim,
        critic,
        critic_optim,
        args.tau,
        args.gamma,
        OUNoise(sigma=args.exploration_noise),
        [env.action_space.low[0], env.action_space.high[0]],
        reward_normalization=True,
        ignore_done=True)
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    log_path = os.path.join(args.logdir, args.task, 'ddpg')
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(log_path, exist_ok=True)
    writer = None  # TensorBoard logging is disabled

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(x):
        # NOTE(review): the threshold is extremely large, so presumably this
        # never triggers and training runs for all epochs -- confirm.
        return x >= 1e16

    # trainer
    try:
        offpolicy_trainer(policy,
                          train_collector,
                          test_collector,
                          args.epoch,
                          args.step_per_epoch,
                          args.collect_per_step,
                          args.test_num,
                          args.batch_size,
                          stop_fn=stop_fn,
                          save_fn=save_fn,
                          writer=writer)
    finally:
        # Release the vectorised environments even if training fails.
        train_collector.close()
        test_collector.close()