Code Example #1
def data_func(net, device, train_queue):
    envs = [make_env() for _ in range(NUM_ENVS)]
    new_net = common.AtariA2C(envs[0].observation_space.shape,
                              envs[0].action_space.n).to(device)
    new_net.load_state_dict(net.state_dict())  # copy weights from the shared net into the local copy
    agent = ptan.agent.PolicyAgent(lambda x: new_net(x)[0],
                                   device=device,
                                   apply_softmax=True)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        envs, agent, gamma=GAMMA, steps_count=REWARD_STEPS)

    for exp in exp_source:
        new_rewards = exp_source.pop_total_rewards()
        if new_rewards:
            # report the total reward of every finished episode separately
            train_queue.put(TotalReward(reward=np.mean(new_rewards)))
        # the experience sample itself always goes to the queue
        train_queue.put(exp)
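The items put on train_queue are consumed by the parent training process, which is not part of this excerpt. The following is only a minimal sketch of such a consumer loop, assuming the same TotalReward namedtuple; BATCH_SIZE and episode_rewards are illustrative names, and the actual loss computation is elided.

batch = []
episode_rewards = []  # illustrative: rewards reported by the workers
while True:
    train_entry = train_queue.get()
    if isinstance(train_entry, TotalReward):
        # a worker reports the total reward of a finished episode
        episode_rewards.append(train_entry.reward)
        continue
    # everything else is an experience sample produced by data_func
    batch.append(train_entry)
    if len(batch) < BATCH_SIZE:
        continue
    # ... compute the A2C loss on the batch and update the shared net ...
    batch.clear()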
Code Example #2
if __name__ == '__main__':
    mp.set_start_method('spawn')
    parser = argparse.ArgumentParser()
    parser.add_argument('--cuda',
                        default=False,
                        action='store_true',
                        help='Enable cuda')
    parser.add_argument('-n', '--name', required=True, help='Name of the run')
    args = parser.parse_args()

    device = 'cuda' if args.cuda else 'cpu'

    env = make_env()

    net = common.AtariA2C(env.observation_space.shape,
                          env.action_space.n).to(device)

    net.share_memory()  # move the weights into shared memory so child processes see the same tensors

    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE, eps=1e-3)

    train_queue = mp.Queue(maxsize=PROCESSES_COUNT)
    data_proc_list = []

    for proc_idx in range(PROCESSES_COUNT):
        proc_name = '-a3c-grad_' + NAME + '_' + args.name + '#%d' % proc_idx
        data_proc = mp.Process(target=grads_func,
                               args=(proc_name, net, device, train_queue))
        data_proc.start()
        data_proc_list.append(data_proc)
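Only the worker start-up is shown above; the parent's consuming loop is not part of the excerpt. Below is a hypothetical sketch of that loop, assuming each entry put on train_queue by grads_func is a tuple of per-parameter gradient arrays (in the same order as net.parameters()) and that a worker signals shutdown with None; GRADS_BATCH is an illustrative constant and torch is assumed to be imported.

    grad_buffer = None
    step_idx = 0
    try:
        while True:
            train_entry = train_queue.get()
            if train_entry is None:
                break  # assumed convention: a worker requests shutdown
            step_idx += 1
            # accumulate the gradients arriving from the workers
            if grad_buffer is None:
                grad_buffer = train_entry
            else:
                for tgt_grad, grad in zip(grad_buffer, train_entry):
                    tgt_grad += grad
            if step_idx % GRADS_BATCH == 0:
                # copy the accumulated gradients onto the shared net and step
                for param, grad in zip(net.parameters(), grad_buffer):
                    param.grad = torch.FloatTensor(grad).to(device)
                optimizer.step()
                grad_buffer = None
    finally:
        for p in data_proc_list:
            p.terminate()
            p.join()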
Code Example #3
    os.makedirs(saves_path, exist_ok=True)

    envs = [common.make_env() for _ in range(common.NUM_ENVS)]
    test_env = common.make_env(test=True)

    if args.seed:
        common.set_seed(args.seed, envs, cuda=args.cuda)
        suffix = "-seed=%d" % args.seed
    else:
        suffix = ""
    writer = SummaryWriter(comment="-03_i2a_" + args.name + suffix)

    obs_shape = envs[0].observation_space.shape
    act_n = envs[0].action_space.n

    net_policy = common.AtariA2C(obs_shape, act_n).to(device)

    net_em = i2a.EnvironmentModel(obs_shape, act_n)
    net_em.load_state_dict(torch.load(args.em, map_location=lambda storage, loc: storage))
    net_em = net_em.to(device)

    net_i2a = i2a.I2A(obs_shape, act_n, net_em, net_policy, ROLLOUTS_STEPS).to(device)
    print(net_i2a)

    obs = envs[0].reset()
    obs_v = ptan.agent.default_states_preprocessor([obs]).to(device)
    res = net_i2a(obs_v)

    optimizer = optim.RMSprop(net_i2a.parameters(), lr=LEARNING_RATE, eps=1e-5)
    policy_opt = optim.Adam(net_policy.parameters(), lr=POLICY_LR)
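In the I2A setup, the imagined rollouts are driven by the small net_policy, which is trained by distillation from the full I2A agent rather than by the main A2C loss, hence the separate policy_opt. The fragment below is only an illustrative distillation step, not the excerpt's code: it assumes torch.nn.functional is imported as F, that AtariA2C returns (policy logits, value), and that probs_v holds the action probabilities the I2A agent produced for the same obs_v batch.

    policy_opt.zero_grad()
    logits_v, _ = net_policy(obs_v)
    # cross-entropy between the distilled policy and the I2A agent's policy
    policy_loss_v = -(F.log_softmax(logits_v, dim=1) * probs_v).sum(dim=1).mean()
    policy_loss_v.backward()
    policy_opt.step()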
Code Example #4
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda")
    parser.add_argument("-n", "--name", required=True, help="Name of the run")
    parser.add_argument("-m", "--model", required=True, help="File with model to load")
    args = parser.parse_args()
    device = torch.device("cuda" if args.cuda else "cpu")

    saves_path = os.path.join("saves", "02_env_" + args.name)
    os.makedirs(saves_path, exist_ok=True)

    envs = [common.make_env() for _ in range(NUM_ENVS)]
    writer = SummaryWriter(comment="-02_env_" + args.name)

    net = common.AtariA2C(envs[0].observation_space.shape, envs[0].action_space.n)
    net_em = i2a.EnvironmentModel(envs[0].observation_space.shape, envs[0].action_space.n).to(device)
    net.load_state_dict(torch.load(args.model, map_location=lambda storage, loc: storage))
    net = net.to(device)
    print(net_em)
    optimizer = optim.Adam(net_em.parameters(), lr=LEARNING_RATE)

    step_idx = 0
    best_loss = np.inf
    with ptan.common.utils.TBMeanTracker(writer, batch_size=100) as tb_tracker:
        for mb_obs, mb_obs_next, mb_actions, mb_rewards, done_rewards, done_steps in iterate_batches(envs, net, device):
            if len(done_rewards) > 0:
                m_reward = np.mean(done_rewards)
                m_steps = np.mean(done_steps)
                print("%d: done %d episodes, mean reward=%.2f, steps=%.2f" % (
                    step_idx, len(done_rewards), m_reward, m_steps))
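The environment-model update itself is cut off in the excerpt. A minimal sketch of how the rest of the loop body could look (continuing inside the iterate_batches loop), assuming net_em(obs, actions) returns the predicted next observation and reward, torch.nn.functional is imported as F, and the two losses are simply summed (the real weighting may differ):

            obs_v = torch.FloatTensor(mb_obs).to(device)
            obs_next_v = torch.FloatTensor(mb_obs_next).to(device)
            actions_t = torch.LongTensor(mb_actions).to(device)
            rewards_v = torch.FloatTensor(mb_rewards).to(device)

            optimizer.zero_grad()
            out_obs_next_v, out_reward_v = net_em(obs_v, actions_t)
            # observation prediction loss + reward prediction loss
            loss_obs_v = F.mse_loss(out_obs_next_v, obs_next_v)
            loss_rew_v = F.mse_loss(out_reward_v.squeeze(-1), rewards_v)
            loss_total_v = loss_obs_v + loss_rew_v
            loss_total_v.backward()
            optimizer.step()
            step_idx += 1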
Code Example #5
    os.makedirs(saves_path, exist_ok=True)

    envs = [common.make_env() for _ in range(common.NUM_ENVS)]
    test_env = common.make_env(test=True)

    if args.seed:
        common.set_seed(args.seed, envs, cuda=args.cuda)
        suffix = "-seed=%d" % args.seed
    else:
        suffix = ""
    writer = SummaryWriter(comment="-03_i2a_" + args.name + suffix)

    obs_shape = envs[0].observation_space.shape
    act_n = envs[0].action_space.n

    net_policy = common.AtariA2C(obs_shape, act_n)

    net_em = i2a.EnvironmentModel(obs_shape, act_n)
    net_em.load_state_dict(
        torch.load(args.em, map_location=lambda storage, loc: storage))

    net_i2a = i2a.I2A(obs_shape, act_n, net_em, net_policy, ROLLOUTS_STEPS)

    if args.cuda:
        net_policy.cuda()
        net_em.cuda()
        net_i2a.cuda()

    print(net_i2a)

    obs = envs[0].reset()