Example #1
def test(rank, args, shared_model):
    torch.manual_seed(args.seed + rank)

    env = gym.make(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(1, env.action_space)

    model.eval()

    state = env.reset()
    state = E.process_frame42(state)
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True

    start_time = time.time()

    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)

        value, logit, (hx, cx) = model((Variable(state.unsqueeze(0),
                                                 volatile=True), (hx, cx)))
        prob = F.softmax(logit)
        action = prob.max(1)[1].data.numpy()

        state, reward, done, _ = env.step(action[0, 0])
        state = E.process_frame42(state)
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck
        actions.append(action[0, 0])
        if actions.count(actions[0]) == actions.maxlen:
            done = True

        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            state = E.process_frame42(state)
            time.sleep(60)

        state = torch.from_numpy(state)
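
Example #1 (and several of the later examples) uses the pre-0.4 Variable(..., volatile=True) API to run inference without building a graph. Below is a minimal sketch of the same greedy evaluation step using the modern torch.no_grad() context; it follows the names from Example #1 and is an illustration, not the original author's code.

import torch
import torch.nn.functional as F

def greedy_step(model, state, hx, cx):
    # Equivalent of the volatile=True forward pass: no autograd graph is built.
    with torch.no_grad():
        value, logit, (hx, cx) = model((state.unsqueeze(0), (hx, cx)))
        action = F.softmax(logit, dim=-1).argmax(dim=1).item()
    return action, value, (hx, cx)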
Example #2
def test(n_episodes=5, name='LunarLander_ONE.pth'):
    env = gym.make('LunarLander-v2')
    policy = ActorCritic()

    policy.load_state_dict(torch.load('./preTrained/{}'.format(name)))

    render = True
    save_gif = False

    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        running_reward = 0
        for t in range(10000):
            action = policy(state)
            state, reward, done, _ = env.step(action)
            running_reward += reward
            if render:
                env.render()
                if save_gif:
                    img = env.render(mode='rgb_array')
                    img = Image.fromarray(img)
                    img.save('./gif/{}.jpg'.format(t))
            if done:
                break
        print('Episode {}\tReward: {}'.format(i_episode, running_reward))
    env.close()
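
The save_gif branch in Example #2 only writes one JPEG per timestep. As a hedged aside, here is a sketch of stitching those frames into an animated GIF with Pillow afterwards; the ./gif directory and numeric file names come from the example, everything else is an assumption.

import glob
import os
from PIL import Image

def frames_to_gif(frame_dir='./gif', out_path='./episode.gif'):
    # Sort numerically, since '10.jpg' would otherwise sort before '2.jpg'.
    paths = sorted(glob.glob(os.path.join(frame_dir, '*.jpg')),
                   key=lambda p: int(os.path.splitext(os.path.basename(p))[0]))
    frames = [Image.open(p) for p in paths]
    if frames:
        frames[0].save(out_path, save_all=True,
                       append_images=frames[1:], duration=50, loop=0)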
Example #3
def test(rank, args, shared_model, counter):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    model.eval()

    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True

    start_time = time.time()

    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())

        if done and counter.value > args.max_steps:
            test_final(shared_model, env, args)
            save_model(shared_model, args)
            exit()

        with torch.no_grad():
            value, logit = model(state.unsqueeze(0))
        prob = F.softmax(logit, dim=-1)
        action = prob.max(1, keepdim=True)[1].numpy()

        state, reward, done, _ = env.step(action[0, 0])
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck
        actions.append(action[0, 0])
        if actions.count(actions[0]) == actions.maxlen:
            done = True

        if done:
            print(
                "Time {}, num steps {}, FPS {:.0f}, episode reward {}, episode length {}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    counter.value, counter.value / (time.time() - start_time),
                    reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(60)

        state = torch.from_numpy(state)
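
The deque(maxlen=100) guard used in these test loops declares the agent stuck when every action stored in the deque is identical, and then forces done. A tiny standalone sketch of the check, with a shortened window for illustration:

from collections import deque

actions = deque(maxlen=3)
for a in [5, 5, 5, 5]:
    actions.append(a)
    if actions.count(actions[0]) == actions.maxlen:
        print("the last maxlen actions were identical -> force done")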
Example #4
def test(rank, params, shared_model):
    torch.manual_seed(params.seed + rank) # asynchronizing the test agent
    env = create_atari_env(params.env_name, video=True) # running an environment with a video
    env.seed(params.seed + rank) # asynchronizing the environment
    model = ActorCritic(env.observation_space.shape[0], env.action_space) # creating one model
    model.eval() # putting the model in "eval" mode because it won't be trained
    state = env.reset() # getting the input images as numpy arrays
    state = torch.from_numpy(state) # converting them into torch tensors
    reward_sum = 0 # initializing the sum of rewards to 0
    done = True # initializing done to True
    start_time = time.time() # getting the starting time to measure the computation time
    actions = deque(maxlen=100) # cf https://pymotw.com/2/collections/deque.html
    episode_length = 0 # initializing the episode length to 0
    while True: # repeat
        episode_length += 1 # incrementing the episode length by one
        if done: # synchronizing with the shared model (same as train.py)
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)
        value, action_value, (hx, cx) = model((Variable(state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(action_value)
        action = prob.max(1)[1].data.numpy() # the test agent does not explore, it directly plays the best action
        state, reward, done, _ = env.step(action[0, 0]) # done = done or episode_length >= params.max_episode_length
        reward_sum += reward
        if done: # printing the results at the end of each part
            print("Time {}, episode reward {}, episode length {}".format(time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)), reward_sum, episode_length))
            reward_sum = 0 # reinitializing the sum of rewards
            episode_length = 0 # reinitializing the episode length
            actions.clear() # reinitializing the actions
            state = env.reset() # reinitializing the environment
            time.sleep(60) # taking a one-minute break to let the other agents practice (if the game is done)
        state = torch.from_numpy(state) # new state and we continue
Example #5
def test(rank, args, shared_model, counter):
    torch.manual_seed(args.seed + rank)
    torch.save(shared_model.state_dict(), 't.pkl')

    env = Env(args.seed + rank)
    model = ActorCritic(1, env.action_space)

    model.eval()

    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    # env.visual()

    start_time = time.time()

    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=500)
    episode_length = 0
    while True:

        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())


        with torch.no_grad():
            value, logit = model((state.unsqueeze(0)).type(torch.FloatTensor))
        prob = F.softmax(logit, dim=-1)
        action = prob.max(1, keepdim=True)[1].numpy()
        print(action)

        state, reward, done = env.step(action[0, 0])
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck
        actions.append(action[0, 0])
        if actions.count(actions[0]) == actions.maxlen:
            done = True

        if done:
            print("Time {}, num steps {}, FPS {:.0f}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                counter.value, counter.value / (time.time() - start_time),
                reward_sum, episode_length))
            # env.visual()
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()

            time.sleep(60)

        state = torch.from_numpy(state)
Example #6
def run(args):
    device = torch.device("cpu")
    env = gym.make('SpaceInvaders-v0')
    state_size = env.observation_space.shape
    action_size = env.action_space.n

    model = ActorCritic([1, 4, 84, 84], action_size).to(device)
    opt = SharedRMSprop(model.parameters(),
                        lr=args.lr,
                        alpha=args.alpha,
                        eps=1e-8,
                        weight_decay=args.weight_decay,
                        momentum=args.momentum,
                        centered=False)
    opt_lock = mp.Lock()
    scheduler = LRScheduler(args)

    if args.load_fp:
        checkpoint = torch.load(args.load_fp)
        model.load_state_dict(checkpoint['model_state_dict'])
        opt.load_state_dict(checkpoint['optimizer_state_dict'])

    if args.train:
        start = time.time()

        model.share_memory()
        model.train()

        step_counter, max_reward, ma_reward, ma_loss = [
            mp.Value('d', 0.0) for _ in range(4)
        ]

        processes = []
        if args.num_procs == -1:
            args.num_procs = mp.cpu_count()
        for rank in range(args.num_procs):
            p = mp.Process(target=train,
                           args=(rank, args, device, model, opt, opt_lock,
                                 scheduler, step_counter, max_reward,
                                 ma_reward, ma_loss))
            p.start()
            processes.append(p)
        for p in processes:
            p.join()

        if args.verbose > 0:
            print(f"Seconds taken: {time.time() - start}")
        if args.save_fp:
            torch.save(
                {
                    'model_state_dict': model.state_dict(),
                    # 'optimizer_state_dict': opt.state_dict(),
                },
                args.save_fp)

    if args.test:
        model.eval()
        test(args, device, model)
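
The mp.Value('d', 0.0) objects created in Example #6 are process-shared doubles that the train workers can update. A minimal sketch of reading and updating one safely (hypothetical names, not taken from the example):

import torch.multiprocessing as mp

step_counter = mp.Value('d', 0.0)

def bump(counter, amount=1.0):
    # The lock prevents lost updates when several workers increment at once.
    with counter.get_lock():
        counter.value += amount

bump(step_counter)
print(step_counter.value)  # 1.0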
Example #7
def test(shared_model, render=0):
    # torch.manual_seed(rank)

    env = create_atari_env(args.rom)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    model.eval()

    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True

    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=100)
    episode_length = 0
    cx = hx = None
    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256).type(FloatTensor), volatile=True)
            hx = Variable(torch.zeros(1, 256).type(FloatTensor), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)

        value, logit, (hx, cx) = model((Variable(
            state.unsqueeze(0).type(FloatTensor), volatile=True), (hx, cx)))
        prob = F.softmax(logit)
        # print logit.data.numpy()
        action = prob.max(1, keepdim=True)[1].data.cpu().numpy()

        state, reward, done, _ = env.step(action[0, 0])
        if render == 1:
            env.render()
            time.sleep(0.03)
        done = done or episode_length >= 10000
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck
        # actions.append(action[0, 0])
        # if actions.count(actions[0]) == actions.maxlen:
        #     done = True

        if done:
            print("Time {}, episode reward {}, episode length {}".
                  format(get_elapsed_time_str(), reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(60)
        state = torch.from_numpy(state)
Example #8
def test(rank, params, shared_model):
    torch.manual_seed(params.seed + rank)  # to desynchronize the test agent
    env = create_atari_env(params.env_name,
                           video=True)  # to run the environment with video
    env.seed(params.seed + rank)  # to desynchronize the environment

    model = ActorCritic(env.observation_space.shape[0],
                        env.action_space)  # creating the model
    model.eval()  # so the model does not train

    state = env.reset()  # we get the input image as a numpy array
    state = torch.from_numpy(state)  # we convert it into a torch tensor
    reward_sum = 0
    done = True
    start_time = time.time()  # start time
    actions = deque(maxlen=100)  # https://pymotw.com/2/collections/deque.html
    episode_length = 0

    while True:
        episode_length += 1  # we increment the episode length by one
        if done:  # we synchronize with the shared model, as in training mode
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)

        value, action_value, (hx, cx) = model(
            (Variable(state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(action_value)
        action = prob.max(1)[1].data.numpy()  # the test agent does not explore; it plays the best action directly
        state, reward, done, _ = env.step(action[0, 0])  # done = done or episode_length >= params.max_episode_length
        reward_sum += reward

        if done:  # prints the result at the end of each episode
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(60)  # one-minute break to wait for the other agents
        state = torch.from_numpy(state)  # build the new state and continue
Example #9
def test(rank, args, model_path,
         all_cooked_time, all_cooked_bw, all_vp_time, all_vp_unit, num):
    torch.manual_seed(args.seed + rank)

    env = Environment(args, all_cooked_time, all_cooked_bw, all_vp_time, all_vp_unit, random_seed=args.seed + rank)

    model = ActorCritic()
    model.load_state_dict(torch.load(model_path))
    model.eval()
    state = env.reset()
    state_time = time.time()
    episode_length = 0
    # log = open('new-result-1/test-vp-log20000.txt', 'w')
    # log = open('results-3/log20000.txt', 'w')
    # log = open('train_norway_result-2/test_log3000.txt', 'w')
    log = open('result-1/log-' + str(num) + '.txt', 'w')
    while True:
        episode_length += 1
        state = Variable(torch.FloatTensor(state))
        # print('state', state)
        logit, value = model(state.view(-1, 11, 8))
        prob = F.softmax(logit, dim=1)
        _, action = torch.max(prob, 1)
        state, reward, done, (action, vp_quality, ad_quality, out_quality, rebuf, cv, blank_ratio, reward, real_vp_bitrate, smooth) \
            = env.step(action.data.numpy()[0])
        update = True

        if update:
            print("Time {}, action {}, ({},{},{}), bitrate {:.3f}, rebuf {:.3f}, cv {:.3f}, smooth {:.3f}, reward {:.3f}, episode {}".format(
                time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - state_time)),
                action, vp_quality, ad_quality, out_quality, real_vp_bitrate, rebuf, cv, smooth,
                reward, episode_length))
            log.write('action: ' + str(action) + ' (' + str(vp_quality) + ',' + str(ad_quality) + ',' + str(out_quality)
                      + ') rebuf: ' + str(rebuf) + ' cv: ' + str(cv) + ' bitrate: ' + str(real_vp_bitrate) + ' smooth: ' + str(smooth) + ' reward: ' + str(reward)
                      + ' episode: ' + str(episode_length) + '\n')
            # log.write(str())
            # print('Time {}'.format(time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - state_time))))
            # print('time: ', time.gmtime(time.time() - state_time))
            # time.sleep(0.5)
        if done:
            state = env.reset()
        if episode_length == 50000:
            log.close()
            break
Example #10
def test(rank, params, shared_model):
    torch.manual_seed(params.seed + rank)
    env = create_atari_env(params.env_name, video=True)
    env.seed(params.seed + rank)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    model.eval()
    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    start_time = time.time()
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        episode_length += 1
        if done:
            save(model, 'brain.pkl')
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)
        value, action_value, (hx, cx) = model(
            (Variable(state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(action_value)
        action = prob.max(1)[1].data.numpy()
        state, reward, done, _ = env.step(action[0])
        reward_sum += reward
        if done:
            f = open("Statistics.txt", 'a')
            f.write(str(reward_sum) + " " + str(episode_length) + "\n")
            f.close()
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(60)
        state = torch.from_numpy(state)
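
Example #10 appends one "<episode reward> <episode length>" line to Statistics.txt per episode. A small sketch of reading that file back for analysis, assuming exactly that two-column format:

def read_statistics(path="Statistics.txt"):
    # Each line was written as str(reward_sum) + " " + str(episode_length).
    rewards, lengths = [], []
    with open(path) as f:
        for line in f:
            r, l = line.split()
            rewards.append(float(r))
            lengths.append(int(l))
    return rewards, lengths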
Example #11
class A2C:
    def __init__(self, state_dim, action_dim, cfg):
        self.gamma = cfg.gamma
        self.model = ActorCritic(state_dim, action_dim, hidden_dim=cfg.hidden_dim).to(cfg.device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=cfg.lr)
        self.device = cfg.device
        self.loss = 0
        self.env = cfg.env

    def choose_action(self, state):
        state = torch.tensor([state], device=self.device, dtype=torch.float32)
        dist, value = self.model(state)
        action = dist.sample().item()
        return action, value, dist

    def update(self, values, next_values, step_rewards, log_probs, mask_dones, entropy): # update using one episode of data
        expected_values = []
        advantages = []
        actor_losses = []
        critic_losses = []
        for step in range(len(step_rewards)):
            expected_values.append(step_rewards[step].item() + self.gamma * next_values[step].squeeze().item() * mask_dones[step].squeeze().item()) 
            advantages.append(expected_values[step] - values[step].item())
            actor_losses.append(-advantages[step] * log_probs[step].item())
            critic_losses.append(nn.MSELoss()(values[step].squeeze(), torch.tensor([expected_values[step]]).to(self.device)).cpu().detach().numpy())
        actor_loss = mean(actor_losses)
        critic_loss = mean(critic_losses)
        self.loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy
        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()

    def save(self, path):
        model_checkpoint = os.path.join(path, self.env+'actor_critic.pt')
        torch.save(self.model.state_dict(), model_checkpoint)
        print('Model Saved!')

    def load(self, path):
        model_checkpoint = os.path.join(path, self.env+'actor_critic.pt')
        self.model.load_state_dict(torch.load(model_checkpoint))
        print('Model Loaded!')
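
Note that update() above collapses values, targets and log-probs to Python floats and numpy scalars before averaging, so self.loss.backward() can, at best, propagate gradients through the entropy term only. A hedged, tensor-based sketch of the same advantage/actor/critic computation, assuming the inputs are 1-D tensors and mask_dones is 1 for non-terminal steps:

import torch
import torch.nn.functional as F

def a2c_losses(values, next_values, rewards, log_probs, mask_dones, gamma=0.99):
    # One-step targets: r_t + gamma * V(s_{t+1}) * (1 - done_t)
    targets = rewards + gamma * next_values.detach() * mask_dones
    advantages = (targets - values).detach()  # no gradient through the actor's baseline
    actor_loss = -(advantages * log_probs).mean()
    critic_loss = F.mse_loss(values, targets)
    return actor_loss, critic_loss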
Example #12
def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    net = ActorCritic(num_inputs, num_actions)
    net.load_state_dict(torch.load(args.save_path + 'model.pth'))

    net.to(device)
    net.eval()
    running_score = 0
    steps = 0

    for e in range(5):
        done = False

        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            env.render()

            steps += 1
            policy, value = net(state)
            action = get_action(policy, num_actions)
            next_state, reward, done, _ = env.step(action)

            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)

            score += reward
            state = next_state

        print('{} episode | score: {:.2f}'.format(e, score))
Example #13
def test(rank, params, shared_model):
    torch.manual_seed(params.seed + rank)
    env = create_atari_env(params.env_name, video=True)
    env.seed(params.seed + rank)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    model.eval()
    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    start_time = time.time()
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        episode_length += 1
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)
        value, action_value, (hx, cx) = model(
            (Variable(state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(action_value)
        action = prob.max(1)[1].data.numpy()
        state, reward, done, _ = env.step(action[0, 0])
        reward_sum += reward
        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(60)  # 60-second break to allow the other agents to test the environment
        state = torch.from_numpy(state)
Example #14
def local_test(index, opt, global_model):
    torch.manual_seed(123 + index)
    env, num_states, num_actions = create_train_env(opt.world, opt.stage,
                                                    opt.action_type)
    local_model = ActorCritic(num_states, num_actions)
    local_model.eval()
    state = torch.from_numpy(env.reset())
    done = True
    curr_step = 0
    actions = deque(maxlen=opt.max_actions)

    while True:
        curr_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())

        with torch.no_grad():
            if done:
                h_0 = torch.zeros((1, 512), dtype=torch.float)
                c_0 = torch.zeros((1, 512), dtype=torch.float)
            else:
                h_0 = h_0.detach()
                c_0 = c_0.detach()

        logits, value, h_0, c_0 = local_model(state, h_0, c_0)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, _ = env.step(action)
        env.render()
        actions.append(action)

        if curr_step > opt.num_global_steps or actions.count(
                actions[0]) == actions.maxlen:
            done = True

        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()

        state = torch.from_numpy(state)
Example #15
class Agent(mp.Process):

    def __init__(self, global_actor_critic, optimizer, input_dims, nb_actions, gamma, lr, name, global_ep_index,
                 env_id):
        super(Agent, self).__init__()
        self.local_actor_critic = ActorCritic(input_dims, nb_actions, gamma)
        self.global_actor_critic = global_actor_critic
        self.name = "w%02i" % name
        self.episode_index = global_ep_index
        self.env = gym.make(env_id)
        self.optimizer = optimizer

    def run(self):
        t_step = 1
        while self.episode_index.value < EPISODES:
            done = False
            observation = self.env.reset()
            score = 0
            self.local_actor_critic.clear_memory()
            while not done:
                action = self.local_actor_critic.choose_action(observation)
                observation_, reward, done, info = self.env.step(action)
                score += reward
                self.local_actor_critic.remember(observation, action, reward)
                if (t_step % T_MAX) == 0 or done:
                    loss = self.local_actor_critic.calc_loss(done)
                    self.optimizer.zero_grad()
                    loss.backward()
                    for local_param, global_param in zip(
                            self.local_actor_critic.parameters(),
                            self.global_actor_critic.parameters()):
                        global_param._grad = local_param.grad
                    self.optimizer.step()
                    self.local_actor_critic.load_state_dict(self.global_actor_critic.state_dict())
                    self.local_actor_critic.clear_memory()
                t_step += 1
                observation = observation_
            with self.episode_index.get_lock():
                self.episode_index.value += 1
            print(self.name, 'episode ', self.episode_index.value, 'reward %.1f' % score)
Example #16
class Agent:
    def __init__(self):
        self.net = ActorCritic()
        self.net.load_state_dict(
            torch.load('models/good.pt', map_location='cpu'))
        self.net.eval()
        torch.no_grad().__enter__()  # disable gradient tracking

    def brain(self, reversi: Reversi, who: int) -> Coordinate:
        # assert reversi.next == who
        state = torch.Tensor(getBoardState(reversi)).unsqueeze(0)
        policy = self.net(state)[1][0]

        # make sure the chosen position is legal
        for y, x in itertools.product(range(SIZE), repeat=2):
            if not reversi.good[y][x]:
                policy[y * SIZE + x] = 0.
            else:
                policy[y * SIZE + x] += 1e-8  # avoid all probabilities being zero

        action = policy.max(dim=-1).indices.item()
        return (action // SIZE, action % SIZE)
Example #17
def load_checkpoint(filepath):
    #    checkpoint = torch.load(filepath)
    #    model = checkpoint['model']
    #    model.load_state_dict(checkpoint['state_dict'])
    #    for parameter in model.parameters():
    #        parameter.requires_grad = False
    #    model.eval()
    #####################
    model = ActorCritic(len(state), params.output_space)
    optimizer = my_optim.SharedAdam(model.parameters(), lr=params.lr)
    checkpoint = torch.load(params.file_path_shared_model)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    model.eval()

    model_test = ActorCritic(len(state), params.output_space)
    optimizer_test = my_optim.SharedAdam(model_test.parameters(), lr=params.lr)
    checkpoint = torch.load(params.file_path_shared_model_test)
    model_test.load_state_dict(checkpoint['state_dict'])
    optimizer_test.load_state_dict(checkpoint['optimizer'])
    model_test.eval()
    ###########################
    return model
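
load_checkpoint above expects a dict with 'state_dict' and 'optimizer' keys (and relies on state and params being defined in the enclosing scope). A minimal sketch of the matching save side under that assumption:

import torch

def save_checkpoint(model, optimizer, filepath):
    # Store weights and optimizer state under the keys load_checkpoint reads.
    torch.save({'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict()}, filepath)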
Example #18
def train(rank, args, shared_model, counter, lock, optimizer=None):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()
    avg_rew_win_size = 25
    avg_rew = 0
    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    start_time = time.time()
    avg_rew_cnt = 0
    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            episode_length += 1
            value, logit, (hx, cx) = model((state.unsqueeze(0), (hx, cx)))
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)

            state, reward, done, _ = env.step(action.numpy())
            done = done or episode_length >= args.max_episode_length
            reward_sum += reward
            reward = max(min(reward, 1), -1)
            # a quick hack to prevent the agent from getting stuck
            actions.append(action[0, 0])
            if actions.count(actions[0]) == actions.maxlen:
                done = True
                with lock:
                    counter.value += 1

            if done:
                avg_rew = avg_rew + reward_sum
                if avg_rew_cnt % avg_rew_win_size == 0:
                    print(" avg. episode reward {}".format(avg_rew /
                                                           avg_rew_win_size))
                    avg_rew = 0
                print("Time {},  episode reward {}, episode length {}".format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, episode_length))
                episode_length = 0
                reward_sum = 0
                actions.clear()
                state = env.reset()
                avg_rew_cnt = avg_rew_cnt + 1

            state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((state.unsqueeze(0), (hx, cx)))
            R = value.detach()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1] - values[i]
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * gae.detach() - args.entropy_coef * entropies[i]

        optimizer.zero_grad()

        (policy_loss + args.value_loss_coef * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
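
ensure_shared_grads is called in Examples #18 and #20 but not shown. The usual A3C helper points the shared model's gradients at the local worker's gradients, as in this sketch (a common implementation, not necessarily the one these examples use):

def ensure_shared_grads(model, shared_model):
    # Hand the worker's freshly computed gradients to the shared model.
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is not None:
            return
        shared_param._grad = param.grad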
Example #19
def train(rank, args, T, shared_model, shared_average_model, optimiser):
    torch.manual_seed(args.seed + rank)
    # CUDA
    if args.use_cuda:
        torch.cuda.manual_seed(args.seed + rank)

    env = gym.make(args.env)
    env.seed(args.seed + rank)
    model = ActorCritic(env.observation_space, env.action_space,
                        args.hidden_size)

    gpu_id = 0 if args.use_cuda else -1  # TODO: 0 means the first GPU
    if gpu_id >= 0:
        model = model.cuda()
    model.train()

    if not args.on_policy:
        # Normalise memory capacity by number of training processes
        memory = EpisodicReplayMemory(
            args.memory_capacity // args.num_processes,
            args.max_episode_length)

    t = 1  # Thread step counter
    done = True  # Start new episode

    while T.value() <= args.T_max:
        # On-policy episode loop
        while True:
            # Sync with shared model at least every t_max steps
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    model.load_state_dict(shared_model.state_dict())
            else:
                model.load_state_dict(shared_model.state_dict())
            # Get starting timestep
            t_start = t

            # Reset or pass on hidden state
            if done:
                avg_hx = torch.zeros(1, args.hidden_size)
                avg_cx = torch.zeros(1, args.hidden_size)
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        hx = torch.zeros(1, args.hidden_size).cuda()
                        cx = torch.zeros(1, args.hidden_size).cuda()
                else:
                    hx = torch.zeros(1, args.hidden_size)
                    cx = torch.zeros(1, args.hidden_size)

                # Reset environment and done flag
                state = state_to_tensor(env.reset())
                if gpu_id >= 0:
                    state = state.cuda()
                done, episode_length = False, 0
            else:
                # Perform truncated backpropagation-through-time (allows freeing buffers after backwards call)
                hx = hx.detach()
                cx = cx.detach()

            # Lists of outputs for training
            policies, Qs, Vs, actions, rewards, average_policies = [], [], [], [], [], []

            while not done and t - t_start < args.t_max:
                # Calculate policy and values
                policy, Q, V, (hx, cx) = model(state, (hx, cx))

                # the shared model lives on the CPU, so move the state there
                if gpu_id >= 0:
                    to_avg_state = state.cpu()
                else:
                    to_avg_state = state
                average_policy, _, _, (avg_hx, avg_cx) = shared_average_model(
                    to_avg_state, (avg_hx, avg_cx))
                # if gpu_id >= 0:
                #     average_policies = average_policies.cuda()
                # Sample action
                action = torch.multinomial(policy, 1)[0, 0]

                # Step
                next_state, reward, done, _ = env.step(action.item())
                next_state = state_to_tensor(next_state)
                if gpu_id >= 0:
                    next_state = next_state.cuda()

                reward = args.reward_clip and min(max(
                    reward, -1), 1) or reward  # Optionally clamp rewards
                done = done or episode_length >= args.max_episode_length  # Stop episodes at a max length
                episode_length += 1  # Increase episode counter

                if not args.on_policy:
                    # Save (beginning part of) transition for offline training
                    memory.append(state, action, reward,
                                  policy.detach())  # Save just tensors
                # Save outputs for online training
                [
                    arr.append(el) for arr, el in zip((
                        policies, Qs, Vs, actions, rewards,
                        average_policies), (policy, Q, V,
                                            torch.LongTensor([[action]]),
                                            torch.Tensor([[reward]]),
                                            average_policy))
                ]

                # Increment counters
                t += 1
                T.increment()

                # Update state
                state = next_state

            # Break graph for last values calculated (used for targets, not directly as model outputs)
            if done:
                # Qret = 0 for terminal s
                Qret = torch.zeros(1, 1)

                if not args.on_policy:
                    # Save terminal state for offline training
                    memory.append(state, None, None, None)
            else:
                # Qret = V(s_i; θ) for non-terminal s
                _, _, Qret, _ = model(state, (hx, cx))
                Qret = Qret.detach().cpu()

            # Train the network on-policy
            if gpu_id >= 0:
                Qs = list(map(lambda x: x.cpu(), Qs))
                Vs = list(map(lambda x: x.cpu(), Vs))
                policies = list(map(lambda x: x.cpu(), policies))
            _train(args, T, model, shared_model, shared_average_model,
                   optimiser, policies, Qs, Vs, actions, rewards, Qret,
                   average_policies)

            # Finish on-policy episode
            if done:
                break

        # Train the network off-policy when enough experience has been collected
        if not args.on_policy and len(memory) >= args.replay_start:
            # Sample a number of off-policy episodes based on the replay ratio
            for _ in range(_poisson(args.replay_ratio)):
                # Act and train off-policy for a batch of (truncated) episode
                trajectories = memory.sample_batch(args.batch_size,
                                                   maxlen=args.t_max)

                # Reset hidden state
                avg_hx = torch.zeros(args.batch_size, args.hidden_size)
                avg_cx = torch.zeros(args.batch_size, args.hidden_size)
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        hx = torch.zeros(args.batch_size,
                                         args.hidden_size).cuda()
                        cx = torch.zeros(args.batch_size,
                                         args.hidden_size).cuda()
                else:

                    hx = torch.zeros(args.batch_size, args.hidden_size)
                    cx = torch.zeros(args.batch_size, args.hidden_size)

                # Lists of outputs for training
                policies, Qs, Vs, actions, rewards, old_policies, average_policies = [], [], [], [], [], [], []

                # Loop over trajectories (bar last timestep)
                for i in range(len(trajectories) - 1):
                    # Unpack first half of transition
                    state = torch.cat(
                        tuple(trajectory.state
                              for trajectory in trajectories[i]), 0)
                    action = torch.LongTensor([
                        trajectory.action for trajectory in trajectories[i]
                    ]).unsqueeze(1)
                    reward = torch.Tensor([
                        trajectory.reward for trajectory in trajectories[i]
                    ]).unsqueeze(1)
                    old_policy = torch.cat(
                        tuple(trajectory.policy
                              for trajectory in trajectories[i]), 0)

                    # Calculate policy and values
                    policy, Q, V, (hx, cx) = model(state, (hx, cx))
                    average_policy, _, _, (avg_hx,
                                           avg_cx) = shared_average_model(
                                               state, (avg_hx, avg_cx))

                    # Save outputs for offline training
                    [
                        arr.append(el)
                        for arr, el in zip((policies, Qs, Vs, actions, rewards,
                                            average_policies, old_policies), (
                                                policy, Q, V, action, reward,
                                                average_policy, old_policy))
                    ]

                    # Unpack second half of transition
                    next_state = torch.cat(
                        tuple(trajectory.state
                              for trajectory in trajectories[i + 1]), 0)
                    done = torch.Tensor([
                        trajectory.action is None
                        for trajectory in trajectories[i + 1]
                    ]).unsqueeze(1)

                # Do forward pass for all transitions
                _, _, Qret, _ = model(next_state, (hx, cx))
                # Qret = 0 for terminal s, V(s_i; θ) otherwise
                Qret = ((1 - done) * Qret).detach().cpu()

                # Train the network off-policy
                if gpu_id >= 0:
                    Qs = list(map(lambda x: x.cpu(), Qs))
                    Vs = list(map(lambda x: x.cpu(), Vs))
                    policies = list(map(lambda x: x.cpu(), policies))
                _train(args,
                       T,
                       model,
                       shared_model,
                       shared_average_model,
                       optimiser,
                       policies,
                       Qs,
                       Vs,
                       actions,
                       rewards,
                       Qret,
                       average_policies,
                       old_policies=old_policies)
        done = True

    env.close()
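
_poisson(args.replay_ratio) above decides how many off-policy replay updates to run after each on-policy episode. A sketch of the Knuth-style Poisson sampler that ACER implementations commonly use for this helper (an assumption, not code from the example):

import math
import random

def _poisson(lmbd):
    # Knuth's method: multiply uniforms until the product drops below exp(-lambda).
    L, k, p = math.exp(-lmbd), 0, 1.0
    while p > L:
        k += 1
        p *= random.uniform(0, 1)
    return max(k - 1, 0)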
Example #20
def train(rank, shared_model, optimizer):
    """
    :param rank: worker-ID
    :param shared_model: model to sync between workers
    :param optimizer:
    :return:
    """
    # torch.manual_seed(SEED + rank)
    ac_steps = 20
    max_episode_length = 10000
    gamma = 0.99
    tau = 1.0
    max_grad_norm = 50.0
    checkpoint_n = 20

    env = create_atari_env(romname)
    env.seed(SEED + rank)
    state = env.reset()
    state = Variable(torch.from_numpy(state).unsqueeze(0).type(FloatTensor), requires_grad=False)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    t = 0
    done = True
    episodes = 0
    reward_sum = 0
    reward_sum1 = 0
    start_time = time.time()
    best_reward = -999
    isbest = 0
    cx = hx = None
    while True:
        model.load_state_dict(shared_model.state_dict())
        if done:  # need to reset LSTM cell's input
            cx = Variable(torch.zeros(1, 256)).type(FloatTensor)
            hx = Variable(torch.zeros(1, 256)).type(FloatTensor)
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)  # basically this is to detach from previous comp graph

        states = []
        values = []
        log_probs = []
        rewards = []
        entropies = []

        for i in range(ac_steps):
            t += 1
            v, logit, (hx, cx) = model((state, (hx, cx)))
            states.append(state)
            prob = F.softmax(logit)
            log_prob = F.log_softmax(logit)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial().detach()  # detach -- so backprop will NOT go through multinomial()
            log_prob = log_prob.gather(1, action)
            action = action.data[0, 0]
            state, reward, done, _ = env.step(action)
            reward_sum += reward
            reward_sum1 += reward
            done = done or t >= max_episode_length
            if done:
                t_ = t
                t = 0
                state = env.reset()
                episodes += 1
                if episodes % 10 == 0:
                    time_str = time.strftime(
                        "%Hh %Mm %Ss", time.gmtime(time.time() - start_time))
                    print("Time {}, worker-{} episode {} "
                          "mean episode reward {}, "
                          "episode length {}".
                          format(time_str, rank, episodes, reward_sum / 10.0, t_))
                    reward_sum = 0.0

                if episodes % checkpoint_n == 0:
                    ave_reward = reward_sum1 / checkpoint_n
                    if best_reward < ave_reward:
                        isbest = 1
                        best_reward = ave_reward

                    print("Saving checkpoint Time {}, worker-{} episode {} "
                          "mean episode reward {}, "
                          "episode length {} best_reward {}".
                          format(get_elapsed_time_str(), rank, episodes, ave_reward, t_, best_reward))
                    checkpoint_fname = os.path.join(
                        args.savedir,
                        args.rom + '_worker' + str(rank) + '_' + str(episodes))
                    save_checkpoint({'epoch': episodes,
                                     'average_reward': ave_reward,
                                     'time': time.time(),
                                     'state_dict': model.state_dict(),
                                     'optimizer': optimizer.state_dict(),
                                     }, isbest, checkpoint_fname)
                    reward_sum1 = 0.0

            state = Variable(torch.from_numpy(state).unsqueeze(0).type(FloatTensor), requires_grad=False)
            reward = max(min(reward, 1), -1)
            values.append(v)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        # We reach here because either:
        # i) an episode ended (e.g. game over), or
        # ii) we have explored a certain number of steps into the future and it is
        #     time to look back and summarise the collected rewards.
        if done:
            R = torch.zeros(1, 1).type(FloatTensor)
        else:
            value, _, _ = model((state, (hx, cx)))
            R = value.data

        values.append(Variable(R))
        critic_loss = 0
        actor_loss = 0
        R = Variable(R)
        gae = 0
        for i in reversed(range(len(rewards))):
            R = gamma * R + rewards[i]
            advantage = R - values[i]  # type: Variable
            critic_loss += 0.5 * advantage.pow(2)
            td_error = rewards[i] + gamma * values[i + 1].data - values[i].data
            gae = gae * gamma * tau + td_error
            actor_loss -= (Variable(gae) * log_probs[i] + 0.01 * entropies[i])

        optimizer.zero_grad()
        total_loss = actor_loss + critic_loss * 0.5  # type: Variable
        total_loss.backward()  # error occur
        torch.nn.utils.clip_grad_norm(model.parameters(), max_grad_norm)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
Example #21
if __name__ == '__main__':
    env = create_atari_env(args.rom)
    # torch.manual_seed(SEED)
    shared_model = ActorCritic(env.observation_space.shape[0], env.action_space)
    shared_model.share_memory()
    # print (shared_model.conv1._parameters['weight'].data.is_cuda)
    optimizer = SharedAdam(shared_model.parameters(), lr=0.0001)
    optimizer.share_memory()

    if args.play:
        if os.path.isfile(args.play):
            print("=> loading checkpoint '{}'".format(args.play))
            checkpoint = torch.load(args.play)
            #            args.start_epoch = checkpoint['epoch']
            #            best_prec1 = checkpoint['best_prec1']
            shared_model.load_state_dict(checkpoint['state_dict'])
            #optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.play))

        test(shared_model, render=1)  # let it play the game
        exit(0)

    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            #            args.start_epoch = checkpoint['epoch']
            #            best_prec1 = checkpoint['best_prec1']
Example #22
def test(rank, args, shared_model):
    torch.manual_seed(args.seed + rank)

    env = WrapEnv(args.env_name)
    model = ActorCritic(4, env.num_actions, args.num_skips)

    model.eval()

    state = env.reset()
    state = np.concatenate([state] * 4, axis=0)
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    action_stat = [0] * (model.n_real_acts + model.n_aux_acts)

    start_time = time.time()
    episode_length = 0

    for ep_counter in itertools.count(1):
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())

            if not os.path.exists('model-a3c-aux'):
                os.makedirs('model-a3c-aux')
            torch.save(shared_model.state_dict(),
                       'model-a3c-aux/model-{}.pth'.format(args.model_name))
            print('saved model')

        value, logit = model(Variable(state.unsqueeze(0), volatile=True))
        prob = F.softmax(logit)
        action = prob.max(1)[1].data.numpy()

        action_np = action[0, 0]
        action_stat[action_np] += 1

        if action_np < model.n_real_acts:
            state_new, reward, done, info = env.step(action_np)

            if args.testing:
                print('episode', episode_length, 'normal action', action_np,
                      'lives', info['ale.lives'])
                env.render()
            state = np.append(state.numpy()[1:, :, :], state_new, axis=0)
            done = done or episode_length >= args.max_episode_length

            reward_sum += reward
            episode_length += 1
        else:
            state = state.numpy()

            for _ in range(action_np - model.n_real_acts + 2):
                state_new, rew, done, info = env.step(
                    0)  # perform NOOP=0 instead of a random action

                if args.testing:
                    print('episode', episode_length, 'no_op action', action_np,
                          'lives', info['ale.lives'])
                    # env.render()
                state = np.append(state[1:, :, :], state_new, axis=0)
                done = done or episode_length >= args.max_episode_length

                reward_sum += rew
                episode_length += 1
                if done:
                    break

        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            print("actions stats real {}, aux {}".format(
                action_stat[:model.n_real_acts],
                action_stat[model.n_real_acts:]))

            reward_sum = 0
            episode_length = 0
            state = env.reset()
            state = np.concatenate([state] * 4, axis=0)
            action_stat = [0] * (model.n_real_acts + model.n_aux_acts)
            if not args.testing: time.sleep(60)

        state = torch.from_numpy(state)
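
Examples #22 and #23 keep a rolling stack of the last four frames by dropping the oldest frame and appending the newest one. A tiny standalone sketch of that update (the 42x42 frame size is an assumption):

import numpy as np

stack = np.zeros((4, 42, 42), dtype=np.float32)      # four stacked frames
new_frame = np.ones((1, 42, 42), dtype=np.float32)   # latest preprocessed observation
stack = np.append(stack[1:, :, :], new_frame, axis=0)
assert stack.shape == (4, 42, 42)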
Example #23
def test(rank, args, shared_model):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    if not os.path.exists('models-a3c'):
        os.makedirs('models-a3c')
    path = 'models-a3c/model-{}.pth'.format(args.model_name)
    print('saving directory is', path)

    model = ActorCritic(env.action_space.n, args.num_atoms, args.gamma)
    model.eval()

    state = env.reset()
    state = np.concatenate([state] * 4, axis=0)
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    action_stat = [0] * model.num_outputs

    start_time = time.time()
    episode_length = 0

    for ep_counter in itertools.count(1):
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())

            torch.save(shared_model.state_dict(), path)
            print('saved model')

        atoms_logit, logit = model(Variable(state.unsqueeze(0), volatile=True))
        prob = F.softmax(logit)
        action = prob.max(1)[1].data.numpy()

        action_np = action[0, 0]
        action_stat[action_np] += 1

        state_new, reward, done, info = env.step(action_np)
        dead = is_dead(info)

        if args.testing:
            atoms_prob = F.softmax(atoms_logit)
            value = model.get_v(atoms_prob, batch=False)
            atoms_prob = atoms_prob.squeeze().data.numpy()

            print('episode', episode_length, 'normal action', action_np,
                  'lives', info['ale.lives'], 'value', value)
            env.render()

            if ep_counter % 100 == 0:
                plt.plot(model.z, atoms_prob)
                plt.title('average v is {}'.format(value))
                plt.show()
        state = np.append(state.numpy()[1:, :, :], state_new, axis=0)
        done = done or episode_length >= args.max_episode_length

        reward_sum += reward
        episode_length += 1

        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            print("actions stats real {}".format(
                action_stat[:model.num_outputs]))

            reward_sum = 0
            episode_length = 0
            state = env.reset()
            env.seed(args.seed + rank + (args.num_processes + 1) * ep_counter)
            state = np.concatenate([state] * 4, axis=0)
            action_stat = [0] * model.num_outputs
            if not args.testing: time.sleep(60)

        state = torch.from_numpy(state)
Example #24
def test(args, shared_model):
    action_map = _set_action_map()

    env = FixedEnvWrap()

    # time.sleep(10)
    model = ActorCritic()
    model.load_state_dict(shared_model.state_dict())
    model.eval()

    state = env.reset()

    training_time = 0
    vis = visdom.Visdom(env='final')
    line_plot = vis.line(Y=np.array([0]),
                         opts=dict(xlabel='testing count',
                                   ylabel='average reward',
                                   title='ali-v1'))

    start = time.time()
    vis_count = 0
    while True:
        video_count = 1
        reward_all_sum = 0
        reward_all = 0
        reward_all_ave = 0
        reward_gop = 0
        action = 3
        last_action = 3
        # update model before testing all trace files
        # time.sleep(5)
        print('load updated model')
        model.load_state_dict(shared_model.state_dict())
        while True:
            # get the reward for one gop
            while True:
                _, done, decision_flag = env.step_gop(action)
                if decision_flag or done:
                    reward_gop = env.get_reward_gop()
                    state = env.get_state_gop()
                    break
                else:
                    continue
            # print('testing')
            # get action from model
            last_action = action
            with torch.no_grad():
                state = torch.FloatTensor(state)
                logit, _ = model(
                    state.view(-1, args.s_gop_info, args.s_gop_len))
                prob = F.softmax(logit, dim=1)
                _, action = torch.max(prob, 1)
                action = action.data.numpy()[0]

            bitrate, target_buffer = action_map[last_action]
            # print('bitrate: %d, target_buffer: %d, reward is %s' % (bitrate, target_buffer, reward_gop))
            if done:
                print("video count %d, reward is %.5f" %
                      (video_count, reward_all))
                # reward_all_sum += reward_all / 100
                reward_all_sum += reward_all
                video_count += 1
                if reward_all < 0:
                    print('bad model ! just break this loop')
                    reward_all_ave = 0
                    break
                if video_count > env.traces_len * 2:
                    reward_all_ave = reward_all_sum / video_count
                    break
                action = 3
                last_action = 3
                reward_all = 0

            reward_all += reward_gop

        # update the figure of average reward of all testing files
        vis_count += 1
        reward_all_ave = max(reward_all_ave, 0)
        vis.line(Y=np.array([reward_all_ave]),
                 X=np.array([vis_count]),
                 win=line_plot,
                 update='append')
        path = 'ali-v1/actor.pt-' + str(vis_count)
        torch.save(model.state_dict(), path)

        end = time.time()
        hours, rem = divmod(end - start, 3600)
        minutes, seconds = divmod(rem, 60)

        print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes),
                                              seconds))
        print("average reward of traces are: ", reward_all_ave)
        print('saved one model in epoch:', vis_count)
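Note the contrast with the training workers further down in this list: this evaluation loop takes the arg-max of the softmax under torch.no_grad(), while the train() functions sample from the same distribution with multinomial(). A minimal sketch of both choices, assuming logits is a 1 x num_actions tensor (the function name is illustrative):

import torch
import torch.nn.functional as F

def select_action(logits, greedy=True):
    probs = F.softmax(logits, dim=1)
    if greedy:
        # Evaluation: deterministic arg-max, as in the test loop above.
        return torch.max(probs, 1)[1].item()
    # Training: stochastic draw from the policy distribution.
    return torch.multinomial(probs, num_samples=1).item()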
Exemple #25
0
def train(rank, params, shared_model, optimizer):
    torch.manual_seed(params.seed + rank) # shifting the seed with rank so that each training agent runs with a different, desynchronized seed
    env = create_atari_env(params.env_name) # creating an optimized environment thanks to the create_atari_env function
    env.seed(params.seed + rank) # aligning the seed of the environment on the seed of the agent
    model = ActorCritic(env.observation_space.shape[0], env.action_space) # creating the model from the ActorCritic class
    state = env.reset() # state is a numpy array of size 1*42*42, in black & white
    state = torch.from_numpy(state) # converting the numpy array into a torch tensor
    done = True # when the game is done
    episode_length = 0 # initializing the length of an episode to 0
    while True: # repeat
        episode_length += 1 # incrementing the episode length by one
        model.load_state_dict(shared_model.state_dict()) # synchronizing with the shared model - the agent gets the shared model to do an exploration on num_steps
        if done: # if it is the first iteration of the while loop or if the game was just done, then:
            cx = Variable(torch.zeros(1, 256)) # the cell states of the LSTM are reinitialized to zero
            hx = Variable(torch.zeros(1, 256)) # the hidden states of the LSTM are reinitialized to zero
        else: # else:
            cx = Variable(cx.data) # we keep the old cell states, making sure they are in a torch variable
            hx = Variable(hx.data) # we keep the old hidden states, making sure they are in a torch variable
        values = [] # initializing the list of values (V(S))
        log_probs = [] # initializing the list of log probabilities
        rewards = [] # initializing the list of rewards
        entropies = [] # initializing the list of entropies
        for step in range(params.num_steps): # going through the num_steps exploration steps
            value, action_values, (hx, cx) = model((Variable(state.unsqueeze(0)), (hx, cx))) # getting from the model the critic's output V(S), the actor's action scores (logits), and the new hidden & cell states
            prob = F.softmax(action_values) # turning the action scores into a probability distribution with the softmax: prob(a) = exp(score(a))/sum_b(exp(score(b)))
            log_prob = F.log_softmax(action_values) # the corresponding log probabilities: log_prob(a) = log(prob(a))
            entropy = -(log_prob * prob).sum(1) # H(p) = - sum_x p(x).log(p(x))
            entropies.append(entropy) # storing the computed entropy
            action = prob.multinomial().data # selecting an action by taking a random draw from the prob distribution
            log_prob = log_prob.gather(1, Variable(action)) # getting the log prob associated to this selected action
            values.append(value) # storing the value V(S) of the state
            log_probs.append(log_prob) # storing the log prob of the action
            state, reward, done, _ = env.step(action.numpy()) # playing the selected action, reaching the new state, and getting the new reward
            done = (done or episode_length >= params.max_episode_length) # if the episode lasts too long (the agent is stuck), then it is done
            reward = max(min(reward, 1), -1) # clamping the reward between -1 and +1
            if done: # if the episode is done:
                episode_length = 0 # we reset the episode length counter
                state = env.reset() # we restart the environment
            state = torch.from_numpy(state) # tensorizing the new state
            rewards.append(reward) # storing the new observed reward
            if done: # if we are done
                break # we stop the exploration and we directly move on to the next step: the update of the shared model
        R = torch.zeros(1, 1) # initializing the cumulative reward
        if not done: # if we are not done:
            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx))) # if the rollout did not end, we bootstrap the return from the critic's estimate of the last reached state
            R = value.data # initializing the cumulative reward with V(last state)
        values.append(Variable(R)) # storing the value V(S) of the last reached state S
        policy_loss = 0 # initializing the policy loss
        value_loss = 0 # initializing the value loss
        R = Variable(R) # making sure the cumulative reward R is a torch Variable
        gae = torch.zeros(1, 1) # initializing the Generalized Advantage Estimation to 0
        for i in reversed(range(len(rewards))): # starting from the last exploration step and going back in time
            R = params.gamma * R + rewards[i] # R = gamma*R + r_t = r_0 + gamma r_1 + gamma^2 * r_2 ... + gamma^(n-1)*r_(n-1) + gamma^nb_step * V(last_state)
            advantage = R - values[i] # R is an estimator of Q at time t = i so advantage_i = Q_i - V(state_i) = R - value[i]
            value_loss = value_loss + 0.5 * advantage.pow(2) # computing the value loss
            TD = rewards[i] + params.gamma * values[i + 1].data - values[i].data # computing the temporal difference
            gae = gae * params.gamma * params.tau + TD # gae = sum_i (gamma*tau)^i * TD(i) with gae_i = gae_(i+1)*gamma*tau + (r_i + gamma*V(state_i+1) - V(state_i))
            policy_loss = policy_loss - log_probs[i] * Variable(gae) - 0.01 * entropies[i] # computing the policy loss
        optimizer.zero_grad() # initializing the optimizer
        (policy_loss + 0.5 * value_loss).backward() # backpropagating the combined loss; the value loss is scaled by 0.5 so the policy loss effectively carries twice the weight
        torch.nn.utils.clip_grad_norm(model.parameters(), 40) # rescaling the gradients so their overall norm does not exceed 40, preventing them from blowing up and destabilizing training
        ensure_shared_grads(model, shared_model) # making sure the model of the agent and the shared model share the same gradient
        optimizer.step() # running the optimization step
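The backward loop over rewards above is the core of the update: an n-step discounted return R for the value loss and the GAE recursion for the policy loss. The same arithmetic on plain Python floats, as a standalone sketch (function name and default coefficients are illustrative):

def n_step_returns_and_gae(rewards, values, bootstrap, gamma=0.99, tau=1.0):
    # rewards: r_0..r_{n-1}; values: V(s_0)..V(s_{n-1}); bootstrap: V(s_n), or 0.0 if the episode ended.
    values = values + [bootstrap]
    R, gae = bootstrap, 0.0
    returns, advantages = [], []
    for i in reversed(range(len(rewards))):
        R = gamma * R + rewards[i]                              # discounted n-step return
        delta = rewards[i] + gamma * values[i + 1] - values[i]  # TD error at step i
        gae = gae * gamma * tau + delta                         # GAE recursion
        returns.insert(0, R)
        advantages.insert(0, gae)
    return returns, advantages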
Exemple #26
0
def test(rank, args, T, shared_model):
  torch.manual_seed(args.seed + rank)

  env = gym.make(args.env)
  env.seed(args.seed + rank)
  model = ActorCritic(env.observation_space, env.action_space, args.hidden_size)
  model.eval()

  can_test = True  # Test flag
  t_start = 1  # Test step counter to check against global counter
  rewards, steps = [], []  # Rewards and steps for plotting
  l = str(len(str(args.T_max)))  # Max num. of digits for logging steps
  done = True  # Start new episode

  while T.value() <= args.T_max:
    if can_test:
      t_start = T.value()  # Reset counter

      # Evaluate over several episodes and average results
      avg_rewards, avg_episode_lengths = [], []
      for _ in range(args.evaluation_episodes):
        while True:
          # Reset or pass on hidden state
          if done:
            # Sync with shared model every episode
            model.load_state_dict(shared_model.state_dict())
            hx = Variable(torch.zeros(1, args.hidden_size), volatile=True)
            cx = Variable(torch.zeros(1, args.hidden_size), volatile=True)
            # Reset environment and done flag
            state = state_to_tensor(env.reset())
            done, episode_length = False, 0
            reward_sum = 0

          # Optionally render validation states
          if args.render:
            env.render()

          # Calculate policy
          policy, _, _, (hx, cx) = model(Variable(state, volatile=True), (hx.detach(), cx.detach()))  # Break graph for memory efficiency

          # Choose action greedily
          action = policy.max(1)[1].data[0, 0]

          # Step
          state, reward, done, _ = env.step(action)
          state = state_to_tensor(state)
          reward_sum += reward
          done = done or episode_length >= args.max_episode_length  # Stop episodes at a max length
          episode_length += 1  # Increase episode counter

          # Log and reset statistics at the end of every episode
          if done:
            avg_rewards.append(reward_sum)
            avg_episode_lengths.append(episode_length)
            break

      print(('[{}] Step: {:<' + l + '} Avg. Reward: {:<8} Avg. Episode Length: {:<8}').format(
            datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3],
            t_start,
            sum(avg_rewards) / args.evaluation_episodes,
            sum(avg_episode_lengths) / args.evaluation_episodes))

      if args.evaluate:
        return

      rewards.append(avg_rewards)  # Keep all evaluations
      steps.append(t_start)
      plot_line(steps, rewards)  # Plot rewards
      torch.save(model.state_dict(), 'model.pth')  # Save model params
      can_test = False  # Finish testing
    else:
      if T.value() - t_start >= args.evaluation_interval:
        can_test = True

    time.sleep(0.001)  # Check if available to test every millisecond

  env.close()
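The outer loop above gates evaluation on a shared global step counter rather than on wall-clock time: the tester sleeps briefly and re-checks whether the trainers have advanced by at least evaluation_interval steps since the last evaluation. A minimal sketch of that gate, with a plain integer standing in for the shared counter (names are illustrative):

def should_evaluate(global_step, last_eval_step, evaluation_interval):
    # True once the training workers have advanced enough since the last evaluation.
    return global_step - last_eval_step >= evaluation_interval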
Exemple #27
0
def train(rank, params, shared_model, optimizer):
	torch.manual_seed(params.seed + rank)
	env = create_atari_env(params.env_name) #getting the environment
	env.seed(params.seed + rank)
	model = ActorCritic(env.observation_space.shape[0], env.action_space)
	state = env.reset()
	state = torch.from_numpy(state)
	done = True 
	episode_length = 0
	while True:
		episode_length+=1
		model.load_state_dict(shared_model.state_dict())
		if done:
			cx = Variable(torch.zeros(1,256))
			hx = Variable(torch.zeros(1,256))
		else:
			cx = Variable(cx.data)
			hx = Variable(hx.data)
		values = []
		log_probs = []
		rewards = []
		entropies = []
		for step in range(params.num_steps):
			value, action_values, (hx, cx) = model((Variable(state.unsqueeze(0)), (hx, cx)))
			prob = F.softmax(action_values)
			log_prob = F.log_softmax(action_values)
			entropy = -(log_prob * prob).sum(1)
			entropies.append(entropy)
			action = prob.multinomial().data
			log_prob = log_prob.gather(1, Variable(action))
			values.append(value)
			log_probs.append(log_prob)
			state, reward, done, _ = env.step(action.numpy())
			done = (done or episode_length >= params.max_episode_length)
			reward = max(min(reward,1), -1)
			if done:
				episode_length = 0
				state = env.reset()
			state = torch.from_numpy(state)
			rewards.append(reward)
			if done:
				break 
		R = torch.zeros(1,1)
		if not done:
			value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
			R = value.data
		values.append(Variable(R))
		policy_loss = 0
		value_loss = 0
		R = Variable(R)
		gae = torch.zeros(1,1)
		for i in reversed(range(len(rewards))):
			R = params.gamma*R + rewards[i]
			advantage = R - values[i]
			value_loss = value_loss + 0.5 * advantage.pow(2)
			TD = rewards[i] + params.gamma * values[i+1].data - values[i].data
			gae = gae * params.gamma * params.tau + TD 
			policy_loss = policy_loss - log_probs[i]*Variable(gae) - 0.01*entropies[i]
		optimizer.zero_grad()
		(policy_loss + 0.5 * value_loss).backward()
		torch.nn.utils.clip_grad_norm(model.parameters(), 40)
		ensure_shared_grads(model, shared_model)
		optimizer.step()
Exemple #28
0
def train(rank, args, share_model, counter, lock):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    optimizer = optim.Adam(share_model.parameters(), lr=args.lr)
    model.train()

    state = env.reset()
    state = torch.FloatTensor(state)
    done = True
    # reward_sum = 0
    episode_length = 0
    while True:
        model.load_state_dict(share_model.state_dict())
        if done:
            cx = Variable(torch.zeros(1, 256))
            hx = Variable(torch.zeros(1, 256))
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            episode_length += 1
            value, logit, (hx, cx) = model((Variable(state.unsqueeze(0)), (hx, cx)))
            prob = F.softmax(logit)
            log_prob = F.log_softmax(logit)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial().data
            log_prob = log_prob.gather(1, Variable(action))

            state, reward, done, _ = env.step(action.numpy())
            # print('reward', reward)
            done = done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)
            # reward_sum += reward
            # print(reward)

            with lock:
                counter.value += 1

            if done:
                episode_length = 0
                state = env.reset()

            state = torch.FloatTensor(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                # print('rank: ', rank)
                # print('reward: ', reward_sum)
                # reward_sum = 0
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            delta_t = rewards[i] + args.gamma * values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - log_probs[i] * Variable(gae) - args.entropy_coef * entropies[i]

        optimizer.zero_grad()
        (policy_loss + args.value_loss_coef * value_loss).backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), args.max_grad_norm)
        ensure_shared_grads(model, share_model)
        optimizer.step()
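Several of these train() functions call ensure_shared_grads(model, share_model) just before optimizer.step(). The helper itself is not shown in these examples; in A3C code of this style it usually just points the shared model's gradients at the worker's gradients the first time around. A hedged sketch of that common implementation (an assumption, not necessarily the exact helper used here):

def ensure_shared_grads(model, shared_model):
    for param, shared_param in zip(model.parameters(), shared_model.parameters()):
        if shared_param.grad is not None:
            # Gradients are already wired up for this worker; nothing to do.
            return
        shared_param._grad = param.grad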
Exemple #29
0
def train(rank, args, shared_model, optimizer=None):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    state = env.reset()
    state = torch.from_numpy(state)
    done = True

    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = Variable(torch.zeros(1, 256))
            hx = Variable(torch.zeros(1, 256))
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            value, logit, (hx, cx) = model(
                (Variable(state.unsqueeze(0)), (hx, cx)))
            prob = F.softmax(logit)
            log_prob = F.log_softmax(logit)
            entropy = -(log_prob * prob).sum(1)
            entropies.append(entropy)

            action = prob.multinomial().data
            log_prob = log_prob.gather(1, Variable(action))

            state, reward, done, _ = env.step(action.numpy())
            done = done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)

            if done:
                episode_length = 0
                state = env.reset()

            state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * Variable(gae) - 0.01 * entropies[i]

        optimizer.zero_grad()

        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), 40)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
Exemple #30
0
def test(rank, args, shared_model, counter, loggers, kill):
    counter, steps, max_episodes = counter

    torch.manual_seed(args.seed + rank)

    env = create_vizdoom_env(args.config_path, args.test_scenario_path)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.spaces[0].shape[0],
                        env.action_space, args.topology)

    model.eval()

    state = env.reset()
    reward_sum = 0
    done = True

    start_time = time.time()

    # a quick hack to prevent the agent from getting stuck
    hidden = ((torch.zeros(1, 64), torch.zeros(1, 64)), (torch.zeros(1, 256),
                                                         torch.zeros(1, 256)))
    actions = deque(maxlen=100)
    episode_length = 0
    episode_counter = 0

    obs_index = 0
    obs_history = []
    pose_history = []
    goal_loc = env.goal()

    model.load_state_dict(shared_model.state_dict())

    while not kill.is_set():
        if steps.value > args.max_episode_steps:
            break

        if episode_counter > max_episodes:
            break

        try:
            episode_start_time = time.time()
            episode_length += 1

            value, logit, _, _, hidden = model((state_to_torch(state), hidden))
            prob = F.softmax(logit)
            action = prob.max(1, keepdim=True)[1].data.numpy()

            for i in range(4):
                state, reward, done, _ = env.step(action[0, 0], steps=1)
                reward_sum += reward

                if done:
                    break
                else:
                    obs_frame = (np.moveaxis(state[0], 0, -1) * 255).astype(
                        np.uint8)

                    if isinstance(obs_history, list):
                        obs_history.append(obs_frame)
                    else:
                        obs_history[obs_index, :, :, :] = obs_frame
                        obs_index += 1

                    pose_history.append(env.pose())

            # a quick hack to prevent the agent from getting stuck
            # actions.append(action[0, 0])
            # if actions.count(actions[0]) == actions.maxlen:
            #     done = True

            if done:
                if isinstance(obs_history, list):
                    obs_history = np.array(obs_history)

                if loggers:
                    loggers['test_reward'](env.game.get_total_reward(),
                                           episode_counter)
                    loggers['video'](video(env.wad, env.current_map, goal_loc,
                                           obs_history, pose_history),
                                     episode_counter)
                    loggers['test_time'](time.time() - episode_start_time,
                                         episode_counter)

                print(
                    "Time {}, num episodes {}, FPS {:.0f}, episode reward {}, episode length {}".
                    format(
                        time.strftime("%Hh %Mm %Ss",
                                      time.gmtime(time.time() - start_time)),
                        counter.value,
                        counter.value / (time.time() - start_time), reward_sum,
                        episode_length))
                reward_sum = 0
                episode_length = 0
                actions.clear()
                state = env.reset()

                obs_index = 0
                pose_history = []
                goal_loc = env.goal()

                hidden = ((torch.zeros(1, 64), torch.zeros(1, 64)),
                          (torch.zeros(1, 256), torch.zeros(1, 256)))

                time.sleep(args.eval_interval)

                model.load_state_dict(shared_model.state_dict())

                episode_counter += 1
        except Exception as err:
            kill.set()
            raise err
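The "anti-stuck" heuristic that is commented out above (and active in some of the other test loops) keeps the last 100 greedy actions in a deque and force-ends the episode once they are all identical. A minimal standalone sketch of that check (function names are illustrative):

from collections import deque

def make_action_tracker(maxlen=100):
    return deque(maxlen=maxlen)

def is_stuck(actions, new_action):
    # True once `maxlen` consecutive identical actions have been taken.
    actions.append(new_action)
    return len(actions) == actions.maxlen and actions.count(actions[0]) == actions.maxlen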