Example #1
def test(args, shared_model, env_conf):
    log = {}
    setup_logger('{}_log'.format(args.env),
                 r'{0}{1}_log'.format(args.log_dir, args.env))
    log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format(
        args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

    torch.manual_seed(args.seed)
    env = atari_env(args.env, env_conf)
    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0

    player = Agent(None, env, args, None)
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    player.model.eval()

    for t in itertools.count():
        if player.done:
            player.model.load_state_dict(shared_model.state_dict())

        player.action_test(t)
        reward_sum += player.reward

        if player.done:
            num_tests += 1
            player.current_life = 0
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            log['{}_log'.format(args.env)].info(
                "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean))

            if reward_sum > args.save_score_level:
                player.model.load_state_dict(shared_model.state_dict())
                state_to_save = player.model.state_dict()
                torch.save(state_to_save,
                           '{0}{1}.dat'.format(args.save_model_dir, args.env))

            reward_sum = 0
            player.eps_len = 0
            state = player.env.reset()
            time.sleep(60)
            player.state = torch.from_numpy(state).float()
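
setup_logger is a utility imported from elsewhere in these repositories and is not included in the snippet above. A minimal sketch of what a helper with this name and call pattern typically looks like, assuming it only wires up the standard logging module (file plus console output):

import logging

def setup_logger(logger_name, log_file, level=logging.INFO):
    """Create a named logger that writes both to a file and to the console."""
    logger = logging.getLogger(logger_name)
    formatter = logging.Formatter('%(asctime)s : %(message)s')
    file_handler = logging.FileHandler(log_file, mode='w')
    file_handler.setFormatter(formatter)
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(formatter)
    logger.setLevel(level)
    logger.addHandler(file_handler)
    logger.addHandler(stream_handler)
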
Example #2
def train(rank, args, shared_model, optimizer, env_conf):

    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf, args)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
    env.seed(args.seed + rank)

    tp_weight = args.tp

    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space, args.terminal_prediction,
                           args.reward_prediction)

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()

    # Below is where the worker cores run episodes continuously ...
    average_ep_length = 0

    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, 128).cuda())
                    player.hx = Variable(torch.zeros(1, 128).cuda())
            else:
                player.cx = Variable(torch.zeros(1, 128))
                player.hx = Variable(torch.zeros(1, 128))
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        for step in range(args.num_steps):
            player.eps_len += 1
            player.action_train()
            if player.done:
                break

        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = value.data

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        reward_pred_loss = 0
        terminal_loss = 0

        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)  # wrap the bootstrap value so it enters the recursion as a constant (no gradient flows through it)

        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * player.values[
                i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - player.log_probs[i] * Variable(
                gae) - 0.01 * player.entropies[i]

            if args.reward_prediction:
                reward_pred_loss = reward_pred_loss + (
                    player.reward_predictions[i] - player.rewards[i]).pow(2)

        if args.terminal_prediction:  # use the empirical episode length as a proxy for the current episode's length
            if player.average_episode_length is None:
                end_predict_labels = np.arange(
                    player.eps_len - len(player.terminal_predictions),
                    player.eps_len) / player.eps_len  # heuristic
            else:
                end_predict_labels = np.arange(
                    player.eps_len - len(player.terminal_predictions),
                    player.eps_len) / player.average_episode_length

            for i in range(len(player.terminal_predictions)):
                terminal_loss = terminal_loss + (
                    player.terminal_predictions[i] -
                    end_predict_labels[i]).pow(2)

            terminal_loss = terminal_loss / len(player.terminal_predictions)

        player.model.zero_grad()
        #print(f"policy loss {policy_loss} and value loss {value_loss} and terminal loss {terminal_loss} and reward pred loss {reward_pred_loss}")

        total_loss = policy_loss + 0.5 * value_loss + tp_weight * terminal_loss + 0.5 * reward_pred_loss

        total_loss.backward()  # will free memory ...

        # Visualize Computation Graph
        #graph = make_dot(total_loss)
        #from graphviz import Source
        #Source.view(graph)

        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()

        if player.done:
            if player.average_episode_length is None:  # initial one
                player.average_episode_length = player.eps_len
            else:
                player.average_episode_length = int(
                    0.99 * player.average_episode_length +
                    0.01 * player.eps_len)
            #print(player.average_episode_length, 'current one is ', player.eps_len)
            player.eps_len = 0  # reset here
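
The backward loop over player.rewards above combines an n-step return for the value loss with Generalized Advantage Estimation (GAE) for the policy loss. A self-contained sketch of the same recursion on toy data (the rewards, values, log-probs and entropies below are made up for illustration; tau plays the role of the GAE lambda):

import torch

gamma, tau = 0.99, 1.0
rewards = [0.0, 1.0, 0.0, 1.0]
# One value estimate per step plus a bootstrap value for the state after the last step.
values = [torch.tensor([[v]]) for v in (0.5, 0.6, 0.4, 0.7, 0.3)]
log_probs = [torch.tensor([[-0.7]], requires_grad=True) for _ in rewards]
entropies = [torch.tensor([[1.2]]) for _ in rewards]

R = values[-1]                      # bootstrap with V(s_T)
gae = torch.zeros(1, 1)
policy_loss, value_loss = 0, 0
for i in reversed(range(len(rewards))):
    R = gamma * R + rewards[i]                       # discounted n-step return
    advantage = R - values[i]
    value_loss = value_loss + 0.5 * advantage.pow(2)
    # GAE: discounted sum of TD residuals delta_t
    delta_t = rewards[i] + gamma * values[i + 1] - values[i]
    gae = gae * gamma * tau + delta_t
    policy_loss = policy_loss - log_probs[i] * gae.detach() - 0.01 * entropies[i]

(policy_loss + 0.5 * value_loss).backward()
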
Example #3
env_conf = setup_json["Default"]
for i in setup_json.keys():
    if i in args.env:
        env_conf = setup_json[i]
torch.set_default_tensor_type('torch.FloatTensor')

saved_state = torch.load('{0}{1}.dat'.format(args.load_model_dir, args.env),
                         map_location=lambda storage, loc: storage)

log = {}
setup_logger('{}_mon_log'.format(args.env),
             r'{0}{1}_mon_log'.format(args.log_dir, args.env))
log['{}_mon_log'.format(args.env)] = logging.getLogger('{}_mon_log'.format(
    args.env))

env = atari_env("{}".format(args.env), env_conf)
model = A3Clstm(env.observation_space.shape[0], env.action_space)

num_tests = 0
reward_total_sum = 0
player = Agent(model, env, args, state=None)
player.env = gym.wrappers.Monitor(player.env,
                                  "{}_monitor".format(args.env),
                                  force=True)
player.model.eval()
for i_episode in range(args.num_episodes):
    state = player.env.reset()
    player.state = torch.from_numpy(state).float()
    player.eps_len = 0
    reward_sum = 0
    while True:
Example #4
def test(args, shared_model, env_conf, lock, counter):
    ptitle('Test Agent')
    gpu_id = args.gpu_ids[-1]
    log = {}
    setup_logger(
        '{}_log'.format(args.env),
        r'{0}{1}-{2}_log'.format(args.log_dir, args.env, args.log_target))
    log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format(
        args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

    torch.manual_seed(args.seed)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed)
    env = atari_env(args.env, env_conf, args)
    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)

    player.state = player.env.reset()
    player.eps_len += 2
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.model = player.model.cuda()
            player.state = player.state.cuda()
    flag = True
    max_score = 0
    while True:
        if flag:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.model.load_state_dict(shared_model.state_dict())
            else:
                player.model.load_state_dict(shared_model.state_dict())
            player.model.eval()
            flag = False

        player.action_test()
        reward_sum += player.reward

        if player.done and not player.info:
            state = player.env.reset()
            player.eps_len += 2
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
        elif player.info:
            flag = True
            num_tests += 1
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            with lock:
                counter.value += 1
            log['{}_log'.format(args.env)].info(
                "UpdateStep {0} Time {1}, episode reward {2}, episode length {3}, reward mean {4:.4f}"
                .format(
                    counter.value,
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean))

            if args.save_max and reward_sum >= max_score:
                max_score = reward_sum
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(
                            state_to_save,
                            '{0}{1}_{2}.dat'.format(args.save_model_dir,
                                                    args.env, args.log_target))
                else:
                    state_to_save = player.model.state_dict()
                    torch.save(
                        state_to_save,
                        '{0}{1}_{2}.dat'.format(args.save_model_dir, args.env,
                                                args.log_target))

            reward_sum = 0
            player.eps_len = 0
            state = player.env.reset()
            player.eps_len += 2
            time.sleep(10)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
Example #5
def train(rank, args, shared_model, optimizer, env_conf):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf, args)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    player.eps_len += 2
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, 512).cuda())
                    player.hx = Variable(torch.zeros(1, 512).cuda())
            else:
                player.cx = Variable(torch.zeros(1, 512))
                player.hx = Variable(torch.zeros(1, 512))
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break

        if player.done:
            if player.info['ale.lives'] == 0 or player.max_length:
                player.eps_len = 0
            state = player.env.reset()
            player.eps_len += 2
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = value.data

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data

            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]

        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(player.model.parameters(), 100.0)
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
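
ensure_shared_grads is imported from these repositories' utility module and is not shown in the snippets. A sketch of what such a helper commonly does in Hogwild-style A3C, under the assumption that its only job is to copy the worker's gradients onto the shared CPU model before optimizer.step():

def ensure_shared_grads(model, shared_model, gpu=False):
    """Copy this worker's gradients onto the shared model's parameters."""
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is not None and not gpu:
            # In the CPU case the shared parameter is assumed to already
            # reference the worker's gradient tensor from an earlier sync.
            return
        if param.grad is not None:
            # Move the gradient to CPU when the worker ran on a GPU.
            shared_param.grad = param.grad.cpu() if gpu else param.grad
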
Example #6
    if not os.path.exists(args.save_model_dir):
        os.mkdir(args.save_model_dir)

    if args.seed:
        torch.manual_seed(args.seed)
    if args.gpu_ids == -1:
        args.gpu_ids = [-1]
    else:
        if args.seed:
            torch.cuda.manual_seed(args.seed)
        mp.set_start_method('spawn')
    setup_json = read_config(args.env_config)
    env_conf = setup_json["Default"]
    for i in setup_json.keys():
        if i in args.env:
            env_conf = setup_json[i]
    env = atari_env(args.env, env_conf, args)
    shared_model = A3Cff(env.observation_space.shape[0], env.action_space)
    if args.load_path:

        saved_state = torch.load(args.load_path,
                                 map_location=lambda storage, loc: storage)
        shared_model.load_state_dict(saved_state)

    shared_model.share_memory()

    if args.optimizer == 'RMSprop':
        optimizer = SharedRMSprop(shared_model.parameters(), lr=args.lr)
    if args.optimizer == 'Adam':
        optimizer = SharedAdam(shared_model.parameters(),
                               lr=args.lr,
                               amsgrad=args.amsgrad)
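
This snippet stops right after the shared model and optimizer are built. The usual next step in these A3C scripts is to launch one test process plus args.workers training processes with torch.multiprocessing; the continuation below is a hedged sketch, assuming the train/test signatures used in the examples above rather than code from this exact repository:

import torch.multiprocessing as mp  # already imported in this script as mp

processes = []
p = mp.Process(target=test, args=(args, shared_model, env_conf))
p.start()
processes.append(p)
for rank in range(args.workers):
    p = mp.Process(target=train,
                   args=(rank, args, shared_model, optimizer, env_conf))
    p.start()
    processes.append(p)
for p in processes:
    p.join()
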
Example #7
# left 3 5.
def grayshow(img):
    img = img.squeeze()
    # img = img / 2 + 0.5  # unnormalize
    # npimg = img.numpy()
    plt.imshow(img, cmap='gray')
    plt.show()
if __name__ == '__main__':
    pygame.init()
    screen = pygame.display.set_mode((300, 300))
    pygame.display.set_caption('键盘监听中')  # caption: "listening for keyboard input"
    screen.fill((255, 255, 255))
    pygame.key.set_repeat(70)
    pygame.display.flip()

    env = atari_env(args.env, env_conf, args)  # gym.make("SpaceInvaders-v0")

    # action1 = env.action_space.sample()
    # print(env.action_space.n)
    # while action1 in [0,2,3,4,5]:
    #     action1 = env.action_space.sample()
    # print(action1)
    init_log = 3
    while True:
        trace_s = []
        trace_a = []
        s = env.reset()
        action = key_action[NO]
        while True:
            # grayshow(s)
            trace_s.append(s)
Example #8
def train(rank, reward_type, args, shared_model, optimizer, env_conf):
    log = {}
    setup_logger('{}_log'.format(args.env),
                 r'{0}{1}_log'.format(args.log_dir, args.env))
    log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format(
        args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

    torch.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf)
    env.seed(args.seed + rank)

    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0

    player = Agent(None, env, args, None, reward_type)
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    player.model.train()

    for i in itertools.count():
        if i % 10 == 0:
            print("reward type {0}, iter {1}".format(reward_type, i))
        player.model.load_state_dict(shared_model.state_dict())
        for step in range(args.num_steps):
            player.action_train()
            reward_sum += player.reward
            if args.count_lives:
                player.check_state()
            if player.done:
                break

        if player.done:
            num_tests += 1
            player.current_life = 0
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            log['{}_log'.format(args.env)].info(
                "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean))

            player.eps_len = 0
            player.current_life = 0
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()

        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = value.data

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]

        optimizer.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(player.model.parameters(), 40)
        ensure_shared_grads(player.model, shared_model)
        optimizer.step()
        player.clear_actions()
Example #9
def train_robust(rank, args, shared_model, optimizer, env_conf):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    if args.seed:
        torch.manual_seed(args.seed + rank)
        if gpu_id >= 0:
            torch.cuda.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf, args)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
    if args.seed:
        env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Cff(player.env.observation_space.shape[0],
                         player.env.action_space)

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    player.eps_len += 2
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())

        p = optimizer.param_groups[0]['params'][0]
        step = optimizer.state[p]['step']
        if step >= (args.total_frames / args.num_steps):
            return
        # increase epsilon linearly, saturating at epsilon_end two thirds of the way through training
        lin_coeff = min(1, (1.5 * int(step) + 1) /
                        (args.total_frames / args.num_steps))
        epsilon = lin_coeff * args.epsilon_end
        kappa = args.kappa_end  #(1-lin_coeff)*1 + lin_coeff*args.kappa_end
        for step in range(args.num_steps):
            player.action_train(bound_epsilon=epsilon)
            if player.done:
                break

        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        R = torch.zeros(1, 1)
        if not player.done:
            value, _ = player.model(Variable(player.state.unsqueeze(0)))
            R = value.data

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]

            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data

            gae = gae * args.gamma * args.tau + delta_t

            if gae >= 0:
                worst_case_loss = -player.min_log_probs[i] * Variable(gae)
            else:
                worst_case_loss = -player.max_log_probs[i] * Variable(gae)
            standard_loss = -player.log_probs[i] * Variable(gae)

            policy_loss = policy_loss + kappa * standard_loss + (
                1 - kappa) * worst_case_loss - 0.01 * player.entropies[i]
        #print(policy_loss + 0.5 * value_loss)
        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()

        player.clear_actions()
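
The schedule above grows the perturbation bound linearly so that it reaches args.epsilon_end two thirds of the way through the planned number of gradient updates. The same schedule factored into a small standalone helper (a sketch; total_updates stands in for args.total_frames / args.num_steps):

def epsilon_schedule(step, total_updates, epsilon_end):
    """Linear ramp from ~0 to epsilon_end, saturating at 2/3 of total_updates."""
    lin_coeff = min(1.0, (1.5 * int(step) + 1) / total_updates)
    return lin_coeff * epsilon_end

# For example, with 3000 scheduled updates and epsilon_end = 1/255:
#   epsilon_schedule(0, 3000, 1 / 255)     -> ~1.3e-06 (just above zero)
#   epsilon_schedule(2000, 3000, 1 / 255)  -> 1/255    (saturated from here on)
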
Example #10
def test(args, shared_models, env_conf):
    ptitle('Test Agent')
    gpu_id = args.gpu_ids[-1]
    log = {}
    setup_logger('{}_log'.format(args.env),
                 r'{0}{1}_log'.format(args.log_dir, args.env))
    log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format(
        args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

    torch.manual_seed(args.seed)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed)
    env = atari_env(args.env, env_conf, args)
    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0
    player = Agent(env, args, gpu_id)
    player.state = player.env.reset()
    player.eps_len += 2
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
    flag = True
    max_score = 0
    prev_reward = 0
    while True:
        if flag:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.models[0].load_state_dict(
                        shared_models[0].state_dict())
                    player.models[1].load_state_dict(
                        shared_models[1].state_dict())
            else:
                player.models[0].load_state_dict(shared_models[0].state_dict())
                player.models[1].load_state_dict(shared_models[1].state_dict())
            player.models[0].eval()
            player.models[1].eval()
            flag = False

        player.action_test()
        reward_sum += player.reward

        if player.done and not player.info:
            state = player.env.reset()
            player.eps_len += 2
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
        elif player.info:
            flag = True
            num_tests += 1
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            log['{}_log'.format(args.env)].info(
                "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean))
            with open('./results', 'a') as f:
                line = f"{reward_total_sum - prev_reward}\n"
                f.write(line)
                prev_reward = reward_total_sum
            player.episodic_reward = 0
            if args.save_max and reward_sum >= max_score:
                max_score = reward_sum
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.models[0].state_dict()
                        torch.save(
                            state_to_save,
                            '{0}{1}_early.dat'.format(args.save_model_dir,
                                                      args.env))
                        state_to_save = player.models[1].state_dict()
                        torch.save(
                            state_to_save,
                            '{0}{1}_late.dat'.format(args.save_model_dir,
                                                     args.env))
                else:
                    state_to_save = player.models[0].state_dict()
                    torch.save(
                        state_to_save,
                        '{0}{1}_early.dat'.format(args.save_model_dir,
                                                  args.env))
                    state_to_save = player.models[1].state_dict()
                    torch.save(
                        state_to_save,
                        '{0}{1}_late.dat'.format(args.save_model_dir,
                                                 args.env))

            reward_sum = 0
            player.eps_len = 0
            state = player.env.reset()
            player.eps_len += 2
            time.sleep(10)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
Example #11
def train(rank, args, shared_model, optimizer):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]

    writer = SummaryWriter(log_dir=args.log_dir + 'tb_train')
    log = {}
    setup_logger('{}_train_log'.format(rank),
                 r'{0}{1}_train_log'.format(args.log_dir, rank))
    log['{}_train_log'.format(rank)] = logging.getLogger(
        '{}_train_log'.format(rank))
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = atari_env(env_id=rank, args=args, type='train')
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id

    player.model = A3Clstm(player.env.observation_space.shape[2],
                           player.env.action_space.n)

    player.state = player.env.reset()
    player.state = normalize_rgb_obs(player.state)
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    num_trains = 0

    if not os.path.exists(args.log_dir + "images/"):
        os.makedirs(args.log_dir + "images/")

    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        for step in range(args.num_steps):
            player.action_train()

            if player.done:
                break

        if player.done:
            num_trains += 1
            log['{}_train_log'.format(rank)].info('entropy:{0}'.format(
                player.entropy.data[0]))
            writer.add_scalar("data/entropy_" + str(rank),
                              player.entropy.data[0], num_trains)
            writer.add_image('FCN_' + str(rank), player.fcn, num_trains)
            writer.add_image('Depth_GroundTruth_' + str(rank), player.depth,
                             num_trains)
            writer.add_image('RGB_' + str(rank), player.env.get_rgb(),
                             num_trains)

            save_image(
                player.fcn.data, args.log_dir + "images/" + str(rank) + "_" +
                str(num_trains) + "_fcn.png")
            # print("player.fcn.data:", player.fcn.data)
            save_image(
                player.depth.data, args.log_dir + "images/" + str(rank) + "_" +
                str(num_trains) + "_depth.png")
            cv2.imwrite(
                args.log_dir + "images/" + str(rank) + "_" + str(num_trains) +
                "_rgb.png", player.env.get_rgb())
            # print("player.depth.data:", player.depth.data)

            player.eps_len = 0
            player.current_life = 0
            state = player.env.reset()
            state = normalize_rgb_obs(state)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        R = torch.zeros(1, 1)
        if not player.done:
            with torch.cuda.device(gpu_id):
                value, _, _, _ = player.model(
                    (Variable(player.state.unsqueeze(0)), (player.hx,
                                                           player.cx),
                     Variable(
                         torch.from_numpy(player.env.target).type(
                             torch.FloatTensor).cuda())))
                R = value.data

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = args.gamma * player.values[
                i + 1].data + player.rewards[i] - player.values[i].data

            gae = gae * args.gamma * args.tau + delta_t

            # policy_loss =  policy_loss - \
            #     player.log_probs[i] * \
            #     Variable(gae) - 0.01 * player.entropies[i] \
            #     + player.fcn_losses[i] # FCN

            policy_loss = (policy_loss
                           - 1e-5 * (player.log_probs[i] * Variable(gae))
                           - 1e-5 * (0.01 * player.entropies[i])
                           + player.fcn_losses[i] * DEPTH_LOSS_DISCOUNT)  # FCN depth loss term

            # policy_loss = policy_loss + player.fcn_losses[i]  # FCN

        writer.add_scalar("data/value_loss_" + str(rank), value_loss,
                          num_trains)
        writer.add_scalar("data/policy_loss_" + str(rank), policy_loss,
                          num_trains)

        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(player.model.parameters(), 40.0)
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
Example #12
def test(args, shared_model, optimizer, env_conf):
    ptitle('Test Agent')
    gpu_id = args.gpu_ids[-1]
    start_time = datetime.now().strftime('%Y-%m-%d_%H_%M_%S')
    log = {}
    
    setup_logger('{}_log'.format(args.env), r'{0}{1}_{2}_log'.format(
        args.log_dir, args.env, start_time))
    log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format(
        args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))
    if not os.path.exists(args.save_model_dir):
        os.mkdir(args.save_model_dir)
    if args.seed:
        torch.manual_seed(args.seed)
        if gpu_id >= 0:
            torch.cuda.manual_seed(args.seed)
            
    env = atari_env(args.env, env_conf, args)
    reward_sum = 0
    start = time.time()
    num_tests = 0
    reward_total_sum = 0
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Cff(player.env.observation_space.shape[0],
                         player.env.action_space)

    player.state = player.env.reset()
    player.eps_len += 2
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.model = player.model.cuda()
            player.state = player.state.cuda()
    flag = True
    max_score = -10000
    
    while True:
        p = optimizer.param_groups[0]['params'][0]
        step = optimizer.state[p]['step']
        player.model.eval()
        
        if flag:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.model.load_state_dict(shared_model.state_dict())
            else:
                player.model.load_state_dict(shared_model.state_dict())
            
            flag = False
        
        with torch.no_grad():
            if args.robust:
                #player.action_test_losses(args.epsilon_end)
                lin_coeff = min(1, (1.5*int(step)+1)/(args.total_frames/args.num_steps))
                epsilon = lin_coeff*args.epsilon_end
                player.action_train(epsilon)
            else:
                player.action_train()
                #player.action_test_losses()
            
        reward_sum += player.noclip_reward

        if player.done and not player.info:
            state = player.env.reset()
            player.eps_len += 2
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
        elif player.info:
            # calculate losses for tracking
            R = torch.zeros(1, 1)
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    R = R.cuda()
            player.values.append(R)
            gae = torch.zeros(1, 1)
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    gae = gae.cuda()
            R = Variable(R)
            
            standard_loss = 0
            worst_case_loss = 0
            value_loss = 0
            entropy = 0
            
            for i in reversed(range(len(player.rewards))):
                R = args.gamma * R + player.rewards[i]
                advantage = R - player.values[i]

                value_loss += 0.5 * advantage.pow(2)

                # Generalized Advantage Estimation
                delta_t = player.rewards[i] + args.gamma * \
                    player.values[i + 1].data - player.values[i].data
                
                gae = gae * args.gamma * args.tau + delta_t
                if args.robust:
                    if advantage >= 0:
                        worst_case_loss += - player.min_log_probs[i] * Variable(gae)
                    else:
                        worst_case_loss += - player.max_log_probs[i] * Variable(gae)
                        
                standard_loss += -player.log_probs[i] * Variable(gae)
                entropy += player.entropies[i]
            
            standard_loss = standard_loss/len(player.rewards)
            worst_case_loss = worst_case_loss/len(player.rewards)
            value_loss = value_loss/len(player.rewards)
            entropy = entropy/len(player.rewards)
            player.clear_actions()
            
            flag = True
            num_tests += 1
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            log['{}_log'.format(args.env)].info(
                ("Time {0}, steps {1}/{2}, ep reward {3}, ep length {4}, reward mean {5:.3f} \n"+
                "Losses: Policy:{6:.3f}, Worst case: {7:.3f}, Value: {8:.3f}, Entropy: {9:.3f}").
                format(time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start)),
                    int(step), args.total_frames/args.num_steps, reward_sum, player.eps_len, reward_mean,
                      float(standard_loss), float(worst_case_loss), float(value_loss), float(entropy)))

            if args.save_max and reward_sum >= max_score:
                max_score = reward_sum
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(state_to_save, '{0}{1}_{2}_best.pt'.format(
                            args.save_model_dir, args.env, start_time))
                else:
                    state_to_save = player.model.state_dict()
                    torch.save(state_to_save, '{0}{1}_{2}_best.pt'.format(
                        args.save_model_dir, args.env, start_time))

            reward_sum = 0
            player.eps_len = 0
            state = player.env.reset()
            player.eps_len += 2
            
            # stop once the scheduled number of gradient updates has been reached
            if step >= args.total_frames/args.num_steps:
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(state_to_save, '{0}{1}_{2}_last.pt'.format(
                            args.save_model_dir, args.env, start_time))
                else:
                    state_to_save = player.model.state_dict()
                    torch.save(state_to_save, '{0}{1}_{2}_last.pt'.format(
                        args.save_model_dir, args.env, start_time))
                return
            
            time.sleep(10)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
Example #13
def test(args, shared_model, env_conf):
    log = {}
    setup_logger('{}_log'.format(args.env),
                 r'{0}{1}_log'.format(args.log_dir, args.env))
    log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format(
        args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

    torch.manual_seed(args.seed)
    env = atari_env(args.env, env_conf)
    model = A3Clstm(env.observation_space.shape[0], env.action_space)

    state = env.reset()
    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0
    player = Agent(model, env, args, state)
    player.state = torch.from_numpy(state).float()
    player.model.eval()
    while True:
        if player.starter and player.flag:
            player = player_start(player)
        else:
            player.flag = False
        if player.done and not player.flag:
            player.model.load_state_dict(shared_model.state_dict())
            player.cx = Variable(torch.zeros(1, 512), volatile=True)
            player.hx = Variable(torch.zeros(1, 512), volatile=True)
            player.flag = False
        elif not player.flag:
            player.cx = Variable(player.cx.data, volatile=True)
            player.hx = Variable(player.hx.data, volatile=True)
            player.flag = False
        if not player.flag:
            player, reward = player_act(player, train=False)
            reward_sum += reward

        if not player.done:
            if player.current_life > player.info['ale.lives']:
                player.flag = True
                player.current_life = player.info['ale.lives']
            else:
                player.current_life = player.info['ale.lives']
                player.flag = False

        if player.done:
            num_tests += 1
            player.current_life = 0
            player.flag = True
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            log['{}_log'.format(args.env)].info(
                "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean))

            if reward_sum > args.save_score_level:
                player.model.load_state_dict(shared_model.state_dict())
                state_to_save = player.model.state_dict()
                torch.save(state_to_save,
                           '{0}{1}.dat'.format(args.save_model_dir, args.env))

            reward_sum = 0
            player.eps_len = 0
            state = player.env.reset()
            time.sleep(60)
            player.state = torch.from_numpy(state).float()
Example #14
def train(rank, args, shared_models, optimizers, env_conf):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf, args)
    env.seed(args.seed + rank)
    player = Agent(env, args, gpu_id)
    player.rank = rank
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
    player.models[0].train()
    player.models[1].train()
    player.eps_len += 2
#    player.test_models()
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                # player.model.load_state_dict(shared_model.state_dict())
                player.models[0].load_state_dict(shared_models[0].state_dict())
                player.models[1].load_state_dict(shared_models[1].state_dict())
        else:
            # player.model.load_state_dict(shared_model.state_dict())
            player.models[0].load_state_dict(shared_models[0].state_dict())
            player.models[1].load_state_dict(shared_models[1].state_dict())
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, 512).cuda())
                    player.hx = Variable(torch.zeros(1, 512).cuda())
            else:
                player.cx = Variable(torch.zeros(1, 512))
                player.hx = Variable(torch.zeros(1, 512))
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break

        # if rank == 0:
        #     print(player.episodic_reward)
        player.episodic_reward = 0

        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _ = player.models[player.curr_model_id]((Variable(player.state.unsqueeze(0)),
                                        (player.hx, player.cx)))
            R = value.data

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        # player.values.append(Variable(R))
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)
        # print("Length of values vector", len(player.values))
        # print("Length of rewards vector", len(player.rewards))
        # print("Length of model sequence vector", len(player.model_sequence))
        next_val = Variable(R)
        last_val = next_val
        R_vec = [Variable(R), Variable(R)]
        # last_id = player.model_sequence[-1]
        active_flags = [False, False]
        policy_loss = [0, 0]
        value_loss = [0, 0]
        for reward, value, model_id, log_prob, entropy in zip(
                reversed(player.rewards),
                reversed(player.values),
                reversed(player.model_sequence),
                reversed(player.log_probs),
                reversed(player.entropies)
        ):
            active_flags[model_id] = True
            R_vec[model_id] = args.gamma * R_vec[model_id] + reward
            R_vec[(model_id+1)%2] *= args.gamma

            advantage = R_vec[model_id] - value
            value_loss[model_id] += 0.5 * advantage.pow(2)

            delta_t = reward + args.gamma * next_val.data - value.data
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss[model_id] -= (log_prob * Variable(gae) + 0.01 * entropy)

            next_val = value

        try:
            if active_flags[0] is True:
                player.models[0].zero_grad()
                (policy_loss[0] + 0.5 * value_loss[0]).backward()
                ensure_shared_grads(player.models[0], shared_models[0], gpu = gpu_id >= 0)
                optimizers[0].step()
            if active_flags[1] is True:
                player.models[1].zero_grad()
                (policy_loss[1] + 0.5 * value_loss[1]).backward()
                ensure_shared_grads(player.models[1], shared_models[1], gpu = gpu_id >= 0)
                optimizers[1].step()
        except Exception as e:
            print("Exception caught. Ignoring")
            if rank == 1:
                print(rewards)
                print(model_sequence)
        player.clear_actions()
Example #15
    torch.cuda.manual_seed(args.seed)

saved_state = torch.load('{0}{1}.dat'.format(args.load_model_dir, args.env),
                         map_location=lambda storage, loc: storage)

log = {}
setup_logger('{}_mon_log'.format(args.env),
             r'{0}{1}_mon_log'.format(args.log_dir, args.env))
log['{}_mon_log'.format(args.env)] = logging.getLogger('{}_mon_log'.format(
    args.env))

d_args = vars(args)
for k in d_args.keys():
    log['{}_mon_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

env = atari_env(env_id=0, args=args, type='train')
num_tests = 0
reward_total_sum = 0
player = Agent(None, env, args, None)
player.model = A3Clstm(player.env.observation_space.shape[2],
                       player.env.action_space.n)

player.model.load_state_dict(saved_state)

if gpu_id >= 0:
    with torch.cuda.device(gpu_id):
        player.model = player.model.cuda()

# player.env = gym.wrappers.Monitor(
#     player.env, "{}_monitor".format(args.env), force=True)
player.model.eval()
Example #16
# Based on
# https://github.com/pytorch/examples/tree/master/mnist_hogwild
# Training settings
# Implemented multiprocessing using locks but was not beneficial. Hogwild
# training was far superior
if __name__ == '__main__':
    # -------- set the global random seeds and multiprocessing ---------
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    # -------- gym environment preprocessing ---------------
    setup_json = read_config(args.env_config)
    env_conf = setup_json["Default"]
    for i in setup_json.keys():
        if i in args.env:
            env_conf = setup_json[i]
    envs = [atari_env(args.env, env_conf, args, rank) for rank in range(args.workers)]
    observation_space, action_space = envs[0].observation_space.shape[0], envs[0].action_space
    # ------- shared LSTM network, load parameters
    envs = ParallelEnv(envs)

    train(args, envs, observation_space, action_space)

Example #17
def train(rank, args, shared_model, optimizer, env_conf, emb, bi_grams, instructions):
    # Changes the process name
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)

    # Define special vectors
    eos_vector = emb.get_vector("<eos>")
    oov_vector = emb.get_vector("<oov>")

    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf, args)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(
                shared_model.parameters(), lr=args.lr, amsgrad=args.amsgrad)
    env.seed(args.seed + rank)

    # Create agent
    player = Agent(None, env, args, None, emb)
    player.gpu_id = gpu_id

    # Create DNN model for the agent
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space, emb)

    # Set env and move to gpu
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()

    # Set the model to training mode. It has no effect for this model, but it is good practice to add.
    player.model.train()

    # Start iteration
    player.eps_len += 2

    _counter = 0
    while True:

        # Loading param values from shared model
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())

        # Reset LSTM state when episode ends
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, args.lstm_size).cuda())
                    player.hx = Variable(torch.zeros(1, args.lstm_size).cuda())
            else:
                player.cx = Variable(torch.zeros(1, args.lstm_size))
                player.hx = Variable(torch.zeros(1, args.lstm_size))

        # If not ended, save current state value
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        # Make a step and record observations. Repeat until num_steps reached or game is over.
        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break

        # If episode finished before args.num_steps is reached, reset environment
        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        # If episode not finished after args.num_steps:
        # Estimates value function of current state
        R = torch.zeros(1, 1)
        if not player.done:
            _, value, _, _ = player.model((Variable(player.state.unsqueeze(0)),
                                        (player.hx, player.cx)))
            R = value.data

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

    # Append the bootstrap value for the final time step
        player.values.append(Variable(R))

        # Initialise loss accumulator
        policy_loss = 0
        value_loss = 0
        language_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)

        # Accumulate the losses
        for i in reversed(range(len(player.rewards))):

            # Calculating language loss
            if args.use_language:

                # Get action of a time step
                a = np.argmax(player.action_logits[i].detach().cpu().numpy())

                # Get produced vectors of the time step
                produced_logits = player.produced_logits[i]
                # print(produced_vectors)
                # Get target vectors of the time step (an instruction corresponding to the least cost)
                action_instructions = instructions[a]

                # Sample a few instructions (with replacement) from that set
                for _ in range(10):
                    idx = random.randrange(0, len(action_instructions))
                    instruction = action_instructions[idx]

                    target_words = instruction.split()

                    for pos, target_word in enumerate(target_words):
                        # Note: this path assumes a CUDA device is available
                        target_class = torch.tensor(emb.get_index(target_word)).cuda()
                        produced_logit = produced_logits[pos]

                        # cross_entropy combines log_softmax and NLL;
                        # produced_logit is a vector of logits, target_class a class index
                        language_loss += torch.nn.functional.cross_entropy(produced_logit, target_class.unsqueeze(0))
                        if target_word == '<eos>':
                            break


            # Discounted return and advantage for the value loss
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]

            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data

            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]


        # Clear any previously accumulated gradients
        player.model.zero_grad()

        # Backpropagate the combined loss
        if args.use_language:
            (policy_loss + 0.5 * value_loss + 0.1 * 0.01 * language_loss).backward()
        else:
            (policy_loss + 0.5 * value_loss).backward()

        """
        # (policy_loss + 0.5 * value_loss).backward()
        print("****************")
        print(policy_loss)
        print(value_loss)
        # """
        # Periodically log the language loss
        if args.use_language and _counter % 10 == 0:
            print("language loss", language_loss)
        _counter += 1

        # Copy local gradients into the shared model and take an optimizer step
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()

        # Clean agent observations
        player.clear_actions()
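
The ensure_shared_grads helper is called throughout these examples but its body is never shown. As a rough, hedged sketch of the usual pattern (an assumption, not necessarily the exact helper used here), it copies each worker's gradients into the shared CPU model before optimizer.step():

def ensure_shared_grads_sketch(model, shared_model, gpu=False):
    # Hypothetical illustration of the common A3C gradient hand-off pattern.
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is not None and not gpu:
            # On CPU, another worker already set the gradient this round
            return
        if not gpu:
            shared_param._grad = param.grad
        else:
            # Gradients computed on a GPU worker are moved to CPU first
            shared_param._grad = param.grad.cpu() if param.grad is not None else None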
Ejemplo n.º 18
0
def train(rank, args, shared_model, optimizer, env_conf, iters,
          checkpoint_path):
    iters = dill.loads(iters)
    if args.enable_gavel_iterator and rank == 0:
        iters._init_logger()
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf, args)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    player.eps_len += 2
    elapsed_time = 0
    start_time = time.time()

    for i in iters:
        if i % 100 == 0:
            print('GPU %d finished step %d' % (rank, i), flush=True)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, 512).cuda())
                    player.hx = Variable(torch.zeros(1, 512).cuda())
            else:
                player.cx = Variable(torch.zeros(1, 512))
                player.hx = Variable(torch.zeros(1, 512))
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break

        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = value.data

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)
        # Use a separate index so the outer iteration counter i is not shadowed
        for j in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[j]
            advantage = R - player.values[j]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[j] + args.gamma * \
                player.values[j + 1].data - player.values[j].data

            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                player.log_probs[j] * \
                Variable(gae) - 0.01 * player.entropies[j]

        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
        elapsed_time += time.time() - start_time
        start_time = time.time()

        if (args.throughput_estimation_interval is not None
                and i % args.throughput_estimation_interval == 0
                and rank == 0):
            print('[THROUGHPUT_ESTIMATION]\t%s\t%d' % (time.time(), i))

        if (args.max_duration is not None
                and elapsed_time >= args.max_duration):
            break
    if args.enable_gavel_iterator and rank == 0:
        state = shared_model.state_dict()
        iters.save_checkpoint(state, checkpoint_path)
        iters.complete()
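
The inner loop above (discounted return, value loss, and GAE-weighted policy loss) recurs in most of these examples. The following self-contained sketch restates it as a standalone function over plain rollout lists; the function name and signature are illustrative, not from the original code:

import torch

def a3c_losses(rewards, values, log_probs, entropies,
               gamma=0.99, tau=1.00, entropy_beta=0.01):
    # values has len(rewards) + 1 entries; the last one is the bootstrap value
    # (V(s_T) for a truncated rollout, 0 for a terminal state).
    R = values[-1].detach()
    policy_loss, value_loss = 0, 0
    gae = torch.zeros(1, 1)
    for i in reversed(range(len(rewards))):
        R = gamma * R + rewards[i]
        advantage = R - values[i]                   # n-step return error
        value_loss = value_loss + 0.5 * advantage.pow(2)
        delta_t = rewards[i] + gamma * values[i + 1].detach() - values[i].detach()
        gae = gae * gamma * tau + delta_t           # Generalized Advantage Estimation
        policy_loss = policy_loss - log_probs[i] * gae - entropy_beta * entropies[i]
    return policy_loss, value_loss

# Tiny smoke test with dummy tensors (5-step rollout):
rewards = [torch.ones(1, 1)] * 5
values = [torch.zeros(1, 1, requires_grad=True) for _ in range(6)]
log_probs = [torch.zeros(1, 1, requires_grad=True) for _ in range(5)]
entropies = [torch.zeros(1, 1) for _ in range(5)]
p_loss, v_loss = a3c_losses(rewards, values, log_probs, entropies)
(p_loss + 0.5 * v_loss).backward()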
Ejemplo n.º 19
0
def test(args, shared_model, env_conf, shared_counter):
    ptitle('Test Agent')
    gpu_id = args.gpu_ids[-1]
    device = torch.device('cuda:{}'.format(gpu_id) if gpu_id >= 0 else 'cpu')

    log = {}
    setup_logger(
        '{}_log'.format(args.env),
        os.path.join(args.log_dir, '{}-{}_log'.format(args.env,
                                                      args.exp_name)))
    log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format(
        args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    env = atari_env(args.env, env_conf, args)
    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0
    player = Agent(None, env, args, None, gpu_id=gpu_id)
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)
    player.model.apply(weights_init)

    player.state = player.env.reset()
    player.eps_len += 2
    player.state = torch.from_numpy(player.state).to(torch.float32)

    player.model = player.model.to(device)
    player.state = player.state.to(device)

    flag = True
    max_score = 0
    while True:
        if flag:
            player.model.load_state_dict(shared_model.state_dict())
            player.model.eval()
            flag = False

        player.action_test()
        reward_sum += player.reward

        if player.done and not player.info:
            state = player.env.reset()
            player.eps_len += 2
            player.state = torch.from_numpy(state).to(torch.float32)
            player.state = player.state.to(device)
        elif player.info:
            flag = True
            num_tests += 1
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            log['{}_log'.format(args.env)].info(
                "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}, alpha {4:.4f}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean,
                    player.model.log_alpha.exp().detach().item()))

            if args.save_max and reward_sum >= max_score:
                max_score = reward_sum
                torch.save(
                    player.model.state_dict(),
                    os.path.join(args.save_model_dir,
                                 '{}-{}.dat'.format(args.env, args.exp_name)))

            with shared_counter.get_lock():
                shared_counter.value += player.eps_len
                if shared_counter.value > args.interact_steps:
                    break
            reward_sum = 0
            player.eps_len = 0
            state = player.env.reset()
            player.eps_len += 2
            time.sleep(10)

            player.state = torch.from_numpy(state).to(torch.float32)
            player.state = player.state.to(device)
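
The shared_counter used above behaves like a multiprocessing.Value guarded by its built-in lock. A minimal sketch of how such a counter could be created and updated (an assumption about the surrounding launcher code, which is not shown in this example):

import multiprocessing as mp

shared_counter = mp.Value('l', 0)   # signed long, starts at 0

def add_steps(counter, n):
    # Atomically add n steps and return the new running total
    with counter.get_lock():
        counter.value += n
        return counter.value

# Each test/train process would then call, e.g., add_steps(shared_counter, eps_len)
# when an episode finishes.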
Ejemplo n.º 20
0
def test(args, shared_model, env_conf):
    log = {}
    setup_logger('{}_log'.format(args.env), r'{0}{1}_log'.format(
        args.log_dir, args.env))
    log['{}_log'.format(args.env)] = logging.getLogger(
        '{}_log'.format(args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

    torch.manual_seed(args.seed)
    env = atari_env(args.env, env_conf)
    model = A3Clstm(env.observation_space.shape[0], env.action_space)
    model.eval()

    state = env.reset()
    state = torch.from_numpy(state).float()
    reward_sum = 0
    done = True
    start_time = time.time()
    episode_length = 0
    num_tests = 0
    reward_total_sum = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 512), volatile=True)
            hx = Variable(torch.zeros(1, 512), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)

        value, logit, (hx, cx) = model((Variable(
            state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(logit)
        action = prob.max(1)[1].data.numpy()
        state, reward, done, _ = env.step(action[0, 0])
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        if done:
            num_tests += 1
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            log['{}_log'.format(args.env)].info(
                "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}".
                format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, episode_length, reward_mean))

            if reward_sum > args.save_score_level:
                model.load_state_dict(shared_model.state_dict())
                state_to_save = model.state_dict()
                torch.save(state_to_save, '{0}{1}.dat'.format(
                    args.save_model_dir, args.env))

            reward_sum = 0
            episode_length = 0
            state = env.reset()
            time.sleep(60)

        state = torch.from_numpy(state).float()
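
This example still uses the pre-0.4 volatile=True flag, which later PyTorch releases removed. Assuming the model, state, hx, cx, and env defined above, the same inference step could be written with torch.no_grad() (a sketch of an equivalent, not the original code):

import torch
import torch.nn.functional as F

with torch.no_grad():                       # replaces volatile=True
    value, logit, (hx, cx) = model((state.unsqueeze(0), (hx, cx)))
    prob = F.softmax(logit, dim=1)
    action = prob.max(1)[1].item()          # greedy action as a plain int
state, reward, done, _ = env.step(action)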
Ejemplo n.º 21
0
def train(rank, args, shared_model, optimizer, env_conf):
    torch.manual_seed(args.seed + rank)

    env = atari_env(args.env, env_conf)
    model = A3Clstm(env.observation_space.shape[0], env.action_space)
    _ = env.reset()
    action = env.action_space.sample()
    _, _, _, info = env.step(action)
    start_lives = info['ale.lives']

    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()
    env.seed(args.seed + rank)
    state = env.reset()
    state = torch.from_numpy(state).float()
    done = True
    episode_length = 0
    current_life = start_lives
    while True:
        episode_length += 1
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = Variable(torch.zeros(1, 512))
            hx = Variable(torch.zeros(1, 512))
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):

            value, logit, (hx, cx) = model(
                (Variable(state.unsqueeze(0)), (hx, cx)))
            prob = F.softmax(logit)
            log_prob = F.log_softmax(logit)
            entropy = -(log_prob * prob).sum(1)
            entropies.append(entropy)

            action = prob.multinomial().data
            log_prob = log_prob.gather(1, Variable(action))

            state, reward, done, info = env.step(action.numpy())
            done = done or episode_length >= args.max_episode_length
            if args.count_lives:
                if current_life > info['ale.lives']:
                    done = True
                else:
                    current_life = info['ale.lives']
            reward = max(min(reward, 1), -1)

            if done:
                episode_length = 0
                current_life = start_lives
                state = env.reset()

            state = torch.from_numpy(state).float()
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:

            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * Variable(gae) - 0.01 * entropies[i]

        optimizer.zero_grad()

        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), 40)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
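
torch.nn.utils.clip_grad_norm as used above was renamed in later PyTorch releases; on current versions the in-place variant is the equivalent call (a sketch assuming the model object from this example):

import torch

torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=40)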
Ejemplo n.º 22
0
def train_rep(args, shared_model, env_conf):
    batch_size = 16
    train_times = args.rep_train_time
    trace = []
    td_class = [(0, 1), (1, 2), (2, 3), (3, 5), (5, 7), (7, 9)]
    loss_fn = nn.CrossEntropyLoss()
    optimizer_r = Adam(shared_model.r_net.parameters(), lr=args.rl_r)
    optimizer_c = Adam(shared_model.c_net.parameters(), lr=args.rl_r)
    ptitle('Train rep')
    gpu_id = args.gpu_ids[-1]

    torch.manual_seed(args.seed)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed)
    env = atari_env(args.env, env_conf, args)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.model = player.model.cuda()
            player.state = player.state.cuda()
            # player.model.r_net = player.model.r_net.cuda()
            # player.model.c_net = player.model.c_net.cuda()
    flag = True
    while True:
        if flag:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.model.load_state_dict(shared_model.state_dict())
            else:
                player.model.load_state_dict(shared_model.state_dict())
            player.model.train()
            flag = False

        player.action_test()
        trace.append(player.state)
        if len(trace) > args.trace_length:
            # Train a few hundred times (train_times iterations)
            for _ in range(train_times):
                range_c = np.random.randint(0, len(td_class))
                TD = np.random.randint(td_class[range_c][0],
                                       td_class[range_c][1])
                begin = np.random.randint(0, len(trace) - TD - batch_size)
                former = torch.stack(trace[begin:begin + batch_size], dim=0)
                latter = torch.stack(trace[begin + TD:begin + TD + batch_size],
                                     dim=0)
                target = torch.zeros(batch_size, dtype=torch.long) + range_c
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        former = former.cuda()
                        latter = latter.cuda()
                        target = target.cuda()

                rep_f, rep_l = player.model.r_net(former), player.model.r_net(
                    latter)
                output = player.model.c_net(rep_f, rep_l, False)
                loss = loss_fn(output, target)
                optimizer_r.zero_grad()
                optimizer_c.zero_grad()
                loss.backward()
                ensure_shared_grads(player.model.r_net,
                                    shared_model.r_net,
                                    gpu=gpu_id >= 0)
                ensure_shared_grads(player.model.c_net,
                                    shared_model.c_net,
                                    gpu=gpu_id >= 0)
                optimizer_r.step()
                optimizer_c.step()
            trace = []
        if player.done and not player.info:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
        elif player.info:
            flag = True

            state = player.env.reset()
            time.sleep(10)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
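
The td_class buckets above turn the sampled temporal distance between two frames into a classification target. A small self-contained illustration of that sampling logic (my reading of the code above, with hypothetical names):

import numpy as np

td_class = [(0, 1), (1, 2), (2, 3), (3, 5), (5, 7), (7, 9)]

def sample_pair(trace_length, batch_size=16, rng=np.random):
    range_c = rng.randint(0, len(td_class))                       # class label = bucket index
    TD = rng.randint(td_class[range_c][0], td_class[range_c][1])  # temporal distance in frames
    begin = rng.randint(0, trace_length - TD - batch_size)        # start of the paired batch
    return range_c, TD, begin

# e.g. sample_pair(200) might return (3, 4, 57): frames [57:73] are paired with
# frames [61:77] and the classifier is trained to predict bucket 3.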
Ejemplo n.º 23
0
def test(args, shared_model, env_conf):
    #   print('IN TEST')
    ptitle('Test Agent')
    gpu_id = args.gpu_ids[-1]
    log = {}
    setup_logger('{}_log'.format(args.env),
                 r'{0}{1}_log'.format(args.log_dir, args.env))
    log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format(
        args.env))
    setup_logger('{}_map_log'.format(args.env),
                 r'{0}{1}_map_log'.format(args.log_dir, args.env))
    log['{}_map_log'.format(args.env)] = logging.getLogger('{}_map_log'.format(
        args.env))

    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

    torch.manual_seed(args.seed)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed)
    if 'micropolis' in args.env.lower():
        import gym_micropolis
        env = micropolis_env(args.env, env_conf, args)
    else:
        #      print('using atari env for test')
        env = atari_env(args.env, env_conf, args)
    reward_sum = 0
    entropy_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    if 'micropolis' in args.env.lower():
        modelInit = getattr(model, args.design_head)
        player.model = modelInit(player.env.observation_space.shape[0],
                                 player.env.action_space,
                                 player.env.env.env.MAP_X)
        player.lstm_sizes = player.model.getMemorySizes()
        if 'arcade' not in args.env.lower():
            player.lstm_size = (1, 16, player.env.env.env.MAP_X,
                                env.env.env.MAP_Y)
    else:
        player.model = A3Clstm(player.env.observation_space.shape[0],
                               player.env.action_space)

    player.state = player.env.reset()
    player.eps_len += 2
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.model = player.model.cuda()
            player.state = player.state.cuda()
    flag = True
    max_score = 0
    i = 0
    while True:

        if flag:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.model.load_state_dict(shared_model.state_dict())
            else:
                player.model.load_state_dict(shared_model.state_dict())
            player.model.eval()
            flag = False

        player.action_test()
        reward_sum += player.reward
        entropy_sum += player.entropy.data.item()

        if player.done and not player.info:
            state = player.env.reset()
            player.eps_len += 2
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
        elif player.info:
            flag = True
            num_tests += 1
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            log['{}_log'.format(args.env)].info(
                "Time {0}, episode reward {1:1.5e}, entropy {4:1.5e} episode length {2}, reward mean {3:1.5e}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean, entropy_sum))
            import numpy as np
            np.set_printoptions(threshold=400)
            log['{}_map_log'.format(args.env)].info('\n{}'.format(
                np.array2string(
                    np.add(
                        player.env.env.env.micro.map.zoneMap[-1],
                        np.full((player.env.env.env.MAP_X,
                                 player.env.env.env.MAP_Y),
                                2))).replace('\n ',
                                             '').replace('][', ']\n[').replace(
                                                 '[[', '[').replace(']]',
                                                                    ']')))

            if args.save_max and reward_sum >= max_score:
                max_score = reward_sum
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(
                            state_to_save,
                            '{0}best_{1}.dat'.format(args.save_model_dir,
                                                     args.env))
                else:
                    state_to_save = player.model.state_dict()
                    torch.save(
                        state_to_save,
                        '{0}best_{1}.dat'.format(args.save_model_dir,
                                                 args.env))
            if i % 10 == 0:
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(
                            state_to_save,
                            '{0}latest_{1}.dat'.format(args.save_model_dir,
                                                       args.env))
                else:
                    state_to_save = player.model.state_dict()
                    torch.save(
                        state_to_save,
                        '{0}latest_{1}.dat'.format(args.save_model_dir,
                                                   args.env))
            reward_sum = 0
            entropy_sum = 0
            player.eps_len = 0
            state = player.env.reset()
            player.eps_len += 2
            i += 1
            time.sleep(10)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
Ejemplo n.º 24
0
def train(rank, args, shared_model, optimizer, env_conf, shared_counter,
          targ_shared):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    device = torch.device('cuda:{}'.format(gpu_id) if gpu_id >= 0 else 'cpu')

    torch.manual_seed(args.seed + rank)
    torch.cuda.manual_seed(args.seed + rank)

    env = atari_env(args.env, env_conf, args)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None, gpu_id=gpu_id)

    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)
    player.model.apply(weights_init)

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).to(torch.float32)
    player.state = player.state.to(device)
    player.model = player.model.to(device)
    #player.targ_model = copy.deepcopy(player.model)

    player.model.train()
    #player.targ_model.eval()
    player.eps_len += 2
    while True:
        player.model.load_state_dict(shared_model.state_dict())
        #player.targ_model.load_state_dict(targ_shared.state_dict())
        if player.done:
            player.cx = torch.zeros(1, 512).to(device)
            player.hx = torch.zeros(1, 512).to(device)
            #player.targ_cx = copy.deepcopy(player.cx).detach()
            #player.targ_hx = copy.deepcopy(player.hx).detach()
        else:
            player.cx = player.cx.detach()
            player.hx = player.hx.detach()

        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break

        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).to(torch.float32)
            player.state = player.state.to(device)

        #alpha = player.model.log_alpha.exp().detach()
        alpha = .01
        #alpha = 0
        x_R = torch.zeros(1, 1)
        if not player.done:
            with torch.no_grad():
                action, value, logit, q_value, _ = player.model(
                    (player.state.unsqueeze(0), (player.hx, player.cx)))
                x_R = q_value[1].detach() - alpha * F.log_softmax(
                    logit, -1).gather(-1, action)
        x_R = x_R.to(device)
        policy_loss = 0
        adv_gae_loss = 0
        for i in reversed(range(len(player.rewards))):
            x_R = args.gamma * x_R + player.rewards[i]
            adv_gae_loss = adv_gae_loss + (player.tra_adv_gae[i][1] -
                                           x_R.detach()).pow(2) * .5
            #policy_loss = policy_loss - player.log_probs[i] * player.tra_adv_gae[i][0].detach() + alpha * player.log_probs[i] * player.log_probs[i].detach()

            policy_loss = policy_loss - (F.softmax(
                player.values[i], -1) * player.tra_adv_gae[i][0].detach()).sum(
                    -1) - alpha * player.entropies[i].unsqueeze(-1)
            #policy_loss = policy_loss - player.log_probs[i] * (x_R - (F.softmax(player.values[i], -1) *
            #        player.tra_adv_gae[i][0]).sum(-1) - alpha * player.entropies[i]).detach() + alpha * player.log_probs[i] * player.log_probs[i].detach()
            #prob = F.softmax(player.values[i], -1)
            #ent_alpha = alpha * player.entropies[i].unsqueeze(-1)
            #advs = (player.tra_adv_gae[i][0] -
            #        ((player.tra_adv_gae[i][0] * prob).sum(-1, True) +
            #         ent_alpha)).detach()
            #policy_loss = policy_loss - (prob * advs).sum(-1) - ent_alpha
            x_R = x_R - alpha * player.log_probs[i].detach()
        player.model.zero_grad()
        (policy_loss + .5 * adv_gae_loss).backward(retain_graph=False)

        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()

        with shared_counter.get_lock():
            shared_counter.value += len(player.rewards)
            if shared_counter.value > args.interact_steps:
                break
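
The backward pass above folds an entropy term into the return: after adding the reward, it subtracts alpha * log_prob before the next discount, i.e. a soft (entropy-regularized) backup. A tiny numeric sketch of that recursion with made-up values:

import torch

gamma, alpha = 0.99, 0.01
rewards = [torch.tensor([[1.0]]), torch.tensor([[0.0]]), torch.tensor([[2.0]])]
log_probs = [torch.tensor([[-0.7]]), torch.tensor([[-1.2]]), torch.tensor([[-0.3]])]

x_R = torch.zeros(1, 1)                  # bootstrap value (terminal state here)
for i in reversed(range(len(rewards))):
    x_R = gamma * x_R + rewards[i]       # discounted return
    x_R = x_R - alpha * log_probs[i]     # soft backup: entropy bonus at step i
print(x_R)                               # roughly 2.98 for these numbers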
Ejemplo n.º 25
0
def train(rank, args, shared_model, optimizer, optimizer_r, env_conf, lock,
          counter):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf, args)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    player.eps_len += 2
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = [
                        Variable(torch.zeros(1, 512).cuda()),
                        Variable(torch.zeros(1, 512).cuda())
                    ]
                    player.hx = [
                        Variable(torch.zeros(1, 512).cuda()),
                        Variable(torch.zeros(1, 512).cuda())
                    ]
            else:
                player.cx = [
                    Variable(torch.zeros(1, 512)),
                    Variable(torch.zeros(1, 512))
                ]
                player.hx = [
                    Variable(torch.zeros(1, 512)),
                    Variable(torch.zeros(1, 512))
                ]
        else:
            player.cx = [
                Variable(player.cx[0].data),
                Variable(player.cx[1].data)
            ]
            player.hx = [
                Variable(player.hx[0].data),
                Variable(player.hx[1].data)
            ]

        # Check whether the r_net update has propagated here
        # ps = list(player.model.r_net.named_parameters())
        # n, v = ps[6]
        # print(v.sum())
        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break

        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)),
                 (player.hx[0], player.cx[0]), (player.hx[1], player.cx[1])))
            R = value.data

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data

            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]

        with lock:
            counter.value += 1
        # rnet
        player.model.r_net.zero_grad()
        (args.actor_weight * policy_loss +
         (1 - args.actor_weight) * value_loss).backward(retain_graph=True)
        ensure_shared_grads(player.model.r_net,
                            shared_model.r_net,
                            gpu=gpu_id >= 0)
        optimizer_r.step()

        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        player.model.r_net.zero_grad()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
Ejemplo n.º 26
0
def train(rank, args, shared_model, optimizer, env_conf, num_tau_samples=32, num_tau_prime_samples=32, kappa=1.0, num_quantiles=32):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf, args)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(
                shared_model.parameters(), lr=args.lr, amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    player.eps_len += 2
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, 512).cuda())
                    player.hx = Variable(torch.zeros(1, 512).cuda())
            else:
                player.cx = Variable(torch.zeros(1, 512))
                player.hx = Variable(torch.zeros(1, 512))
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break

        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
        
        R = torch.zeros(1, num_tau_prime_samples)
        if not player.done:
            # Pick the greedy action from the mean over quantile samples,
            # then bootstrap from the quantile estimates of that action
            logit, _, _ = player.model((Variable(player.state.unsqueeze(0)),
                                        (player.hx, player.cx)))
            q_vals = torch.mean(logit, 0)
            _, action = torch.max(q_vals, 0)
            logit, _, _ = player.model((Variable(player.state.unsqueeze(0)),
                                        (player.hx, player.cx)))
            R = logit[:, action]

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()
        R = Variable(R)

        value_loss = 0
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]

            # Pairwise TD errors between predicted quantiles and target samples
            advantage = R.repeat(num_tau_samples, 1) - \
                player.logits_array[i].repeat(1, num_tau_prime_samples)
            # Huber loss with threshold kappa
            loss = (torch.abs(advantage) <= kappa).float() * 0.5 * advantage ** 2
            loss += (torch.abs(advantage) > kappa).float() * kappa * \
                (torch.abs(advantage) - 0.5 * kappa)
            # Asymmetric quantile weighting
            step_loss = torch.abs(player.quantiles_array[i].cuda() -
                                  (advantage.detach() < 0).float()) * loss / kappa
            value_loss += step_loss.sum(0).mean(0)

        
        player.model.zero_grad()
        value_loss.backward()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
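
The value-loss loop above is a quantile Huber regression in the style of QR-DQN/IQN. A self-contained restatement as one function; the name, argument names, and shapes are assumptions based on this example rather than the original helpers:

import torch

def quantile_huber_loss(target_R, pred_quantiles, taus, kappa=1.0):
    # target_R: (1, N') target return samples, pred_quantiles: (N, 1),
    # taus: (N, 1) quantile fractions in (0, 1).
    advantage = target_R.repeat(pred_quantiles.size(0), 1) - \
        pred_quantiles.repeat(1, target_R.size(1))
    abs_adv = advantage.abs()
    # Huber loss with threshold kappa
    huber = torch.where(abs_adv <= kappa,
                        0.5 * advantage ** 2,
                        kappa * (abs_adv - 0.5 * kappa))
    # Asymmetric quantile weighting
    weight = torch.abs(taus - (advantage.detach() < 0).float())
    return (weight * huber / kappa).sum(0).mean(0)

# e.g. quantile_huber_loss(torch.randn(1, 32), torch.randn(32, 1), torch.rand(32, 1))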
Ejemplo n.º 27
0
def train(rank, args, shared_model, optimizer, env_conf):

    torch.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.model = A3Clstm(
        player.env.observation_space.shape[0], player.env.action_space)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    player.model.train()

    while True:
        player.model.load_state_dict(shared_model.state_dict())
        for step in range(args.num_steps):
            player.action_train()
            if args.count_lives:
                player.check_state()
            if player.done:
                break

        if player.done:
            player.eps_len = 0
            player.current_life = 0
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()

        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = value.data

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]

        optimizer.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(player.model.parameters(), 40)
        ensure_shared_grads(player.model, shared_model)
        optimizer.step()
        player.clear_actions()
Ejemplo n.º 28
0
        for key in convertor_config.hyperparameters:
            exec ('hyperparameters[\'%s\'] = convertor_config.hyperparameters[\'%s\']' % (key, key))

        trainer = []
        exec ("trainer=%s(convertor_config.hyperparameters)" % convertor_config.hyperparameters['trainer'])
        trainer.gen.load_state_dict(torch.load('/home/amittel/Desktop/CMU/DRL/rl_a3c_pytorch/conversion_models/attentionbreakout2pong_v0_gen_00003500.pkl'))
        trainer.gen.eval()
        #trainer.cuda(args.gpu)
        trainer.share_memory()
        distance_gan = trainer
    else:
        convertor_config = None
        distance_gan = None
    convertor = distance_gan

    env = atari_env(args.env, env_conf, args, None, None, mapFrames=False)

    model_env = None
    if args.use_convertor:
        setup_json = read_config(args.model_env_config)
        model_env_conf = setup_json["Default"]
        for i in setup_json.keys():
            if i in args.model_env:
                model_env_conf = setup_json[i]
        model_env = atari_env(args.model_env, model_env_conf, args)


    shared_model = A3Clstm(env.observation_space.shape[0], env.action_space)


    # (TODO): We need to load the pretrained Pong weights so that the last layer (Ac-
Ejemplo n.º 29
0
def train(rank, args, shared_model, optimizer, env_conf):
    torch.manual_seed(args.seed + rank)

    env = atari_env(args.env, env_conf)
    model = A3Clstm(env.observation_space.shape[0], env.action_space)

    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    env.seed(args.seed + rank)
    state = env.reset()
    player = Agent(model, env, args, state)
    player.state = torch.from_numpy(state).float()
    player.model.train()
    epoch = 0
    while True:

        player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            player.cx = Variable(torch.zeros(1, 512))
            player.hx = Variable(torch.zeros(1, 512))
            if player.starter:
                player = player_start(player, train=True)
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        for step in range(args.num_steps):

            player = player_act(player, train=True)

            if player.done:
                break

            if player.current_life > player.info['ale.lives']:
                player.flag = True
                player.current_life = player.info['ale.lives']
            else:
                player.current_life = player.info['ale.lives']
                player.flag = False
            if args.count_lives:
                if player.flag:
                    player.done = True
                    break

            if player.starter and player.flag:
                player = player_start(player, train=True)
            if player.done:
                break

        if player.done:
            player.eps_len = 0
            player.current_life = 0
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            player.flag = False

        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = value.data

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss += 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - player.log_probs[i] * Variable(gae) - 0.01 * player.entropies[i]

        optimizer.zero_grad()

        (policy_loss + value_loss).backward()

        ensure_shared_grads(player.model, shared_model)
        optimizer.step()
        player.values = []
        player.log_probs = []
        player.rewards = []
        player.entropies = []
def test(rank, args, shared_model):
    ptitle('Test Agent')
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    writer = SummaryWriter(log_dir=args.log_dir + 'tb_test')
    log = {}
    setup_logger('{}_log'.format('Test_' + str(rank)),
                 r'{0}{1}_log'.format(args.log_dir, 'Test_' + str(rank)))
    log['{}_log'.format('Test_' + str(rank))] = logging.getLogger(
        '{}_log'.format('Test_' + str(rank)))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format('Test_' + str(rank))].info('{0}: {1}'.format(
            k, d_args[k]))

    torch.manual_seed(args.seed)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed)
    env = atari_env(env_id=rank, args=args, type='train')
    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    num_inside_target_room = 0
    reward_total_sum = 0
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[2],
                           player.env.action_space.n)

    player.state = player.env.reset()
    player.state = normalize_rgb_obs(player.state)
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.model = player.model.cuda()
            player.state = player.state.cuda()

    player.model.eval()

    action_times = 0
    while True:
        action_times += 1
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.model.load_state_dict(shared_model.state_dict())
            else:
                player.model.load_state_dict(shared_model.state_dict())

        player.action_test()
        reward_sum += player.reward

        if not os.path.exists(args.log_dir + "video/" + str(rank) + "_" +
                              str(num_tests)):
            os.makedirs(args.log_dir + "video/" + str(rank) + "_" +
                        str(num_tests))

        cv2.imwrite(args.log_dir + "video/" + str(rank) + "_" +
                    str(num_tests) + "/" + str(action_times) + ".png",
                    player.env.get_rgb())  # (90, 120, 3)

        if player.done:
            frame_to_video(fileloc=args.log_dir + "video/" + str(rank) + "_" +
                           str(num_tests) + "/%d.png",
                           t_w=120,
                           t_h=90,
                           destination=args.log_dir + "video/" + str(rank) +
                           "_" + str(num_tests) + ".mp4")
            shutil.rmtree(args.log_dir + "video/" + str(rank) + "_" +
                          str(num_tests))
            action_times = 0
            num_tests += 1
            num_inside_target_room += player.env.inside_target_room
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            success_rate = num_inside_target_room / num_tests
            log['{}_log'.format('Test_' + str(rank))].info(
                "Time {0}, Tester {1}, test counter {2}, episode reward {3}, episode length {4}, reward mean {5:.4f}, success rate {6}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)), rank,
                    num_tests, reward_sum, player.eps_len, reward_mean,
                    success_rate))
            # Tensorboard
            writer.add_scalar("data/episode_reward", reward_sum, num_tests)
            writer.add_scalar("data/episode_length", player.eps_len, num_tests)
            writer.add_scalar("data/reward_mean", reward_mean, num_tests)
            writer.add_scalar("data/success_rate", success_rate, num_tests)

            if reward_sum > args.save_score_level:
                # player.model.load_state_dict(shared_model.state_dict())
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(
                            state_to_save,
                            '{0}{1}_{2}.dat'.format(args.save_model_dir,
                                                    'Test_' + str(rank),
                                                    reward_sum))
                else:
                    state_to_save = player.model.state_dict()
                    torch.save(
                        state_to_save,
                        '{0}{1}_{2}.dat'.format(args.save_model_dir,
                                                'Test_' + str(rank),
                                                reward_sum))

            reward_sum = 0
            player.eps_len = 0
            state = player.env.reset()
            time.sleep(10)
            state = normalize_rgb_obs(state)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()