Example #1
def test(args, shared_model, env_conf):
    #   print('IN TEST')
    ptitle('Test Agent')
    gpu_id = args.gpu_ids[-1]
    log = {}
    setup_logger('{}_log'.format(args.env),
                 r'{0}{1}_log'.format(args.log_dir, args.env))
    log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format(
        args.env))
    setup_logger('{}_map_log'.format(args.env),
                 r'{0}{1}_map_log'.format(args.log_dir, args.env))
    log['{}_map_log'.format(args.env)] = logging.getLogger('{}_map_log'.format(
        args.env))

    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

    torch.manual_seed(args.seed)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed)
    if 'micropolis' in args.env.lower():
        import gym_micropolis
        env = micropolis_env(args.env, env_conf, args)
    else:
        #      print('using atari env for test')
        env = atari_env(args.env, env_conf, args)
    reward_sum = 0
    entropy_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    if 'micropolis' in args.env.lower():
        modelInit = getattr(model, args.design_head)
        player.model = modelInit(player.env.observation_space.shape[0],
                                 player.env.action_space,
                                 player.env.env.env.MAP_X)
        player.lstm_sizes = player.model.getMemorySizes()
        if 'arcade' not in args.env.lower():
            player.lstm_size = (1, 16, player.env.env.env.MAP_X,
                                player.env.env.env.MAP_Y)
    else:
        player.model = A3Clstm(player.env.observation_space.shape[0],
                               player.env.action_space)

    player.state = player.env.reset()
    player.eps_len += 2
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.model = player.model.cuda()
            player.state = player.state.cuda()
    flag = True
    max_score = 0
    i = 0
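    # Evaluation loop: reload the shared model's weights at the start of each
    # episode, run the policy in eval mode, and log episode statistics (and
    # the Micropolis zone map) whenever an episode finishes.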
    while True:

        if flag:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.model.load_state_dict(shared_model.state_dict())
            else:
                player.model.load_state_dict(shared_model.state_dict())
            player.model.eval()
            flag = False

        player.action_test()
        reward_sum += player.reward
        entropy_sum += player.entropy.data.item()

        if player.done and not player.info:
            state = player.env.reset()
            player.eps_len += 2
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
        elif player.info:
            flag = True
            num_tests += 1
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            log['{}_log'.format(args.env)].info(
                "Time {0}, episode reward {1:1.5e}, entropy {2:1.5e}, "
                "episode length {3}, reward mean {4:1.5e}".format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, entropy_sum, player.eps_len, reward_mean))
            import numpy as np
            np.set_printoptions(threshold=400)
            # Log the Micropolis zone map as a compact integer grid (offset by 2).
            zone_map = np.add(
                player.env.env.env.micro.map.zoneMap[-1],
                np.full((player.env.env.env.MAP_X,
                         player.env.env.env.MAP_Y), 2))
            map_str = np.array2string(zone_map).replace('\n ', '').replace(
                '][', ']\n[').replace('[[', '[').replace(']]', ']')
            log['{}_map_log'.format(args.env)].info('\n{}'.format(map_str))

            if args.save_max and reward_sum >= max_score:
                max_score = reward_sum
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(
                            state_to_save,
                            '{0}best_{1}.dat'.format(args.save_model_dir,
                                                     args.env))
                else:
                    state_to_save = player.model.state_dict()
                    torch.save(
                        state_to_save,
                        '{0}best_{1}.dat'.format(args.save_model_dir,
                                                 args.env))
            if i % 10 == 0:
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(
                            state_to_save,
                            '{0}latest_{1}.dat'.format(args.save_model_dir,
                                                       args.env))
                else:
                    state_to_save = player.model.state_dict()
                    torch.save(
                        state_to_save,
                        '{0}latest_{1}.dat'.format(args.save_model_dir,
                                                   args.env))
            reward_sum = 0
            entropy_sum = 0
            player.eps_len = 0
            state = player.env.reset()
            player.eps_len += 2
            i += 1
            time.sleep(10)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
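Example #1 depends on a setup_logger helper that is not shown in this listing. Below is a minimal sketch of what such a helper typically looks like in A3C PyTorch projects; it is an assumption, not the project's exact code.

import logging

def setup_logger(logger_name, log_file, level=logging.INFO):
    # Attach file and console handlers to a named logger, so that the
    # log['<env>_log'].info(...) calls above are written under args.log_dir.
    logger = logging.getLogger(logger_name)
    formatter = logging.Formatter('%(asctime)s : %(message)s')
    file_handler = logging.FileHandler(log_file, mode='w')
    file_handler.setFormatter(formatter)
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(formatter)
    logger.setLevel(level)
    logger.addHandler(file_handler)
    logger.addHandler(stream_handler)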
Example #2
def train(rank, args, shared_model, optimizer, env_conf):
    start_time = time.time()
    ptitle('Training Agent: {}'.format(rank))
    #log = {}

    #setup_logger('{}_train_log'.format(args.env), r'{0}{1}_train_log'.format(
    #    args.log_dir, args.env))
    #log['{}_train_log'.format(args.env)] = logging.getLogger(
    #        '{}_train_log'.format(args.env))

    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    if 'micropolis' in args.env.lower():
        env = micropolis_env(args.env, env_conf, args)
    else:
        env = atari_env(args.env, env_conf, args)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        elif args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    if 'micropolis' in args.env.lower():
        modelInit = getattr(model, args.design_head)
        player.model = modelInit(player.env.observation_space.shape[0],
                                 player.env.action_space,
                                 player.env.env.env.MAP_X)
        player.lstm_sizes = player.model.getMemorySizes()
    else:
        player.model = A3Clstm(player.env.observation_space.shape[0],
                               player.env.action_space)
    lstm_size = 512
    if 'micropolis' in args.env.lower():
        if 'arcade' not in args.env.lower():
            lstm_size = (1, 16, env.env.env.MAP_X, env.env.env.MAP_Y)
    player.lstm_size = lstm_size
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    player.eps_len += 2
    log_counter = 0
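    # Each iteration: sync weights from the shared model, roll out up to
    # args.num_steps environment steps, compute the A3C policy/value losses
    # with Generalized Advantage Estimation, and push gradients back to the
    # shared model through the (possibly shared) optimizer.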
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        num_lstm_layers = len(player.lstm_sizes)
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = [
                        Variable(torch.zeros(player.lstm_sizes[i]).cuda())
                        for i in range(num_lstm_layers)
                    ]
                    player.hx = [
                        Variable(torch.zeros(player.lstm_sizes[i]).cuda())
                        for i in range(num_lstm_layers)
                    ]
            else:
                player.cx = [
                    Variable(torch.zeros(player.lstm_sizes[i]))
                    for i in range(num_lstm_layers)
                ]
                player.hx = [
                    Variable(torch.zeros(player.lstm_sizes[i]))
                    for i in range(num_lstm_layers)
                ]
        else:
            player.cx = [
                Variable(player.cx[i].data) for i in range(num_lstm_layers)
            ]
            player.hx = [
                Variable(player.hx[i].data) for i in range(num_lstm_layers)
            ]

        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break
        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if args.randomize_exploration:
                player.certainty = np.random.uniform(0.5, 1.5)
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

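        # Bootstrap the return: if the rollout stopped before the episode
        # ended, seed R with the critic's value estimate of the current state;
        # for a terminal state R stays zero.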
        R = torch.zeros(1, 1)
        if not player.done:
            values, logit, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            if values.size()[1] == 1:
                value = values
            else:
                prob = torch.nn.functional.softmax(logit, dim=1)
                action = prob.multinomial(1).data
                value = values[0][action]

            R = value.data

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = torch.zeros(1, 1).cuda()
                R = Variable(R).cuda()
        else:
            gae = torch.zeros(1, 1)
            R = Variable(R)
        player.values.append(R)
        policy_loss = 0
        value_loss = 0

        for i in reversed(range(len(player.rewards))):
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.rewards[i] = torch.Tensor([player.rewards[i]
                                                      ]).cuda()
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation (GAE):
            # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    gae = Variable(gae.cuda())
            else:
                gae = Variable(gae)
            policy_loss = policy_loss - \
                player.log_probs[i] * Variable(gae) - 0.01 * player.entropies[i]

        #if log_counter % 10 == 0:
        #    log['{}_train_log'.format(args.env)].info(
        #            "Time {0}, reward {1}, policy loss {2}, value loss {3}, entropy {4}".
        #            format(time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
        #                '{:9.2e}'.format(float(sum(player.rewards) / len(player.rewards))),
        #                '{:9.2e}'.format(float(policy_loss.data.item())),
        #                '{:9.2e}'.format(float(value_loss.data.item())),
        #                 '{:10.8e}'.format(float(sum(player.entropies)))))
        #log_counter += 1

        optimizer.zero_grad()
        a3c = args.lmbda * (policy_loss + 0.5 * value_loss)
        a3c.backward()

        torch.nn.utils.clip_grad_norm_(player.model.parameters(), 40)
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()

        player.clear_actions()
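The ensure_shared_grads call above copies the worker's gradients onto the shared model before optimizer.step(). A sketch of the usual A3C helper follows, assuming the shared model lives in CPU shared memory; the project's actual implementation may differ.

def ensure_shared_grads(model, shared_model, gpu=False):
    # Copy each local gradient into the corresponding shared parameter.
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is not None and not gpu:
            # CPU/hogwild case: gradients already reference shared storage,
            # so nothing more needs to be copied.
            return
        elif not gpu:
            shared_param._grad = param.grad
        else:
            # GPU workers hold gradients on the device; move them to the
            # CPU-resident shared model.
            shared_param._grad = param.grad.cpu()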
Example #3
if __name__ == '__main__':
    __spec__ = None  # workaround so multiprocessing 'spawn' works under some launchers
    args = parser.parse_args()
    torch.manual_seed(args.seed)
    if args.gpu_ids == -1:
        args.gpu_ids = [-1]
    else:
        torch.cuda.manual_seed(args.seed)
        mp.set_start_method('spawn')
    setup_json = read_config(args.env_config)
    env_conf = setup_json["Default"]
    for i in setup_json.keys():
        if i in args.env:
            env_conf = setup_json[i]
    if 'micropolis' in args.env.lower():
        env = micropolis_env(args.env, env_conf, args)
        modelInit = getattr(model, args.design_head)
        shared_model = modelInit(env.observation_space.shape[0],
                                 env.action_space, env.env.env.MAP_X)
    else:
        env = atari_env(args.env, env_conf, args)
        shared_model = A3Clstm(env.observation_space.shape[0],
                               env.action_space)
    if args.load:
        saved_state = torch.load('{0}{1}.dat'.format(args.load_model_dir,
                                                     args.env),
                                 map_location=lambda storage, loc: storage)
        shared_model.load_state_dict(saved_state)
    shared_model.share_memory()

    if args.shared_optimizer:
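Example #3 is cut off at the shared-optimizer branch. For orientation, here is a hedged sketch of how such a __main__ block usually continues; SharedRMSprop, SharedAdam, and args.workers are assumptions, not taken from the listing.

    # Sketch only: build a shared optimizer and spawn the worker processes.
    if args.shared_optimizer:
        if args.optimizer == 'RMSprop':
            optimizer = SharedRMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = SharedAdam(shared_model.parameters(), lr=args.lr,
                                   amsgrad=args.amsgrad)
        optimizer.share_memory()
    else:
        optimizer = None

    processes = []
    p = mp.Process(target=test, args=(args, shared_model, env_conf))
    p.start()
    processes.append(p)
    for rank in range(args.workers):
        p = mp.Process(target=train,
                       args=(rank, args, shared_model, optimizer, env_conf))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()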
Example #4
gpu_id = args.gpu_ids[-1]
if gpu_id >= 0:
    torch.cuda.manual_seed(args.seed)

saved_state = torch.load('{0}{1}.dat'.format(args.load_model_dir, args.env),
                         map_location=lambda storage, loc: storage)

log = {}
setup_logger('{}_mon_log'.format(args.env),
             r'{0}{1}_mon_log'.format(args.log_dir, args.env))
log['{}_mon_log'.format(args.env)] = logging.getLogger('{}_mon_log'.format(
    args.env))

d_args = vars(args)
for k in d_args.keys():
    log['{}_mon_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))
if 'micropolis' in args.env.lower():
    env = micropolis_env("{}".format(args.env), env_conf, args)
#else:
#    env = atari_env("{}".format(args.env), env_conf, args)
num_tests = 0
start_time = time.time()
reward_total_sum = 0
player = Agent(None, env, args, None)
if 'micropolis' in args.env.lower():
    modelInit = getattr(model, args.design_head)
    player.model = modelInit(player.env.observation_space.shape[0],
                             player.env.action_space, player.env.env.env.MAP_X)
    player.lstm_sizes = player.model.getMemorySizes()
else:
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)
player.gpu_id = gpu_id
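Example #4 ends before saved_state reaches the model. A rough sketch of how the monitoring script typically proceeds from here, modeled on the test loop in Example #1; it is a hedged illustration, not the project's exact code.

# Sketch only: load the saved weights and run evaluation episodes.
player.model.load_state_dict(saved_state)
player.model.eval()
if gpu_id >= 0:
    with torch.cuda.device(gpu_id):
        player.model = player.model.cuda()

reward_sum = 0
player.state = torch.from_numpy(player.env.reset()).float()
if gpu_id >= 0:
    with torch.cuda.device(gpu_id):
        player.state = player.state.cuda()

while True:
    player.action_test()
    reward_sum += player.reward
    if player.done and player.info:
        num_tests += 1
        reward_total_sum += reward_sum
        log['{}_mon_log'.format(args.env)].info(
            'episode reward {0:1.5e}, reward mean {1:1.5e}'.format(
                reward_sum, reward_total_sum / num_tests))
        reward_sum = 0
        player.eps_len = 0
        player.state = torch.from_numpy(player.env.reset()).float()
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.state = player.state.cuda()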