def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    net = ActorCritic(num_inputs, num_actions)
    optimizer = optim.Adam(net.parameters(), lr=0.001)
    writer = SummaryWriter('logs')

    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)
    
    net.to(device)
    net.train()
    running_score = 0

    for e in range(3000):
        done = False
        score = 0

        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            if args.render:
                env.render()

            policy, value = net(state)
            action = get_action(policy, num_actions)

            next_state, reward, done, _ = env.step(action)
            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)
            
            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1
            transition = [state, next_state, action, reward, mask]
            train_model(net, optimizer, transition, policy, value)

            score += reward
            state = next_state

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % args.log_interval == 0:
            print('{} episode | score: {:.2f}'.format(e, running_score))
            writer.add_scalar('log/score', float(score), e)

        if running_score > args.goal_score:
            ckpt_path = os.path.join(args.save_path, 'model.pth')
            torch.save(net.state_dict(), ckpt_path)
            print('running score exceeds {} so end'.format(args.goal_score))
            break
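The loop above calls get_action and train_model, which this excerpt does not include. As a rough guide only, here is a minimal sketch of what a one-step actor-critic version of those helpers could look like, assuming net(state) returns a softmax policy and a state value; the repository's actual helpers may differ.

import torch
import torch.nn.functional as F
from torch.distributions import Categorical

def get_action(policy, num_actions):
    # Sample an action index from the categorical policy
    # (num_actions is unused here but kept to match the call site).
    return Categorical(probs=policy).sample().item()

def train_model(net, optimizer, transition, policy, value, gamma=0.99):
    # One-step actor-critic update (assumed form, not the original code).
    state, next_state, action, reward, mask = transition

    _, next_value = net(next_state)
    target = reward + mask * gamma * next_value.squeeze()
    advantage = (target - value.squeeze()).detach()

    log_policy = torch.log(policy.squeeze(0)[action] + 1e-8)
    policy_loss = -log_policy * advantage
    value_loss = F.mse_loss(value.squeeze(), target.detach())
    entropy = -(policy * torch.log(policy + 1e-8)).sum()

    loss = policy_loss + value_loss - 0.01 * entropy
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()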
Example 2
def run(args):
    device = torch.device("cpu")
    env = gym.make('SpaceInvaders-v0')
    state_size = env.observation_space.shape
    action_size = env.action_space.n

    model = ActorCritic([1, 4, 84, 84], action_size).to(device)
    opt = SharedRMSprop(model.parameters(),
                        lr=args.lr,
                        alpha=args.alpha,
                        eps=1e-8,
                        weight_decay=args.weight_decay,
                        momentum=args.momentum,
                        centered=False)
    opt_lock = mp.Lock()
    scheduler = LRScheduler(args)

    if args.load_fp:
        checkpoint = torch.load(args.load_fp)
        model.load_state_dict(checkpoint['model_state_dict'])
        opt.load_state_dict(checkpoint['optimizer_state_dict'])

    if args.train:
        start = time.time()

        model.share_memory()
        model.train()

        step_counter, max_reward, ma_reward, ma_loss = [
            mp.Value('d', 0.0) for _ in range(4)
        ]

        processes = []
        if args.num_procs == -1:
            args.num_procs = mp.cpu_count()
        for rank in range(args.num_procs):
            p = mp.Process(target=train,
                           args=(rank, args, device, model, opt, opt_lock,
                                 scheduler, step_counter, max_reward,
                                 ma_reward, ma_loss))
            p.start()
            processes.append(p)
        for p in processes:
            p.join()

        if args.verbose > 0:
            print(f"Seconds taken: {time.time() - start}")
        if args.save_fp:
            torch.save(
                {
                    'model_state_dict': model.state_dict(),
                    # 'optimizer_state_dict': opt.state_dict(),
                },
                args.save_fp)

    if args.test:
        model.eval()
        test(args, device, model)
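SharedRMSprop is not defined in this excerpt. A common way to implement it (the pattern used by pytorch-a3c style shared optimizers) is to subclass the stock optimizer and pre-allocate its state in shared memory so that every worker process updates the same buffers. A minimal sketch under that assumption; the original class may differ:

import torch
import torch.optim as optim

class SharedRMSprop(optim.RMSprop):
    # RMSprop whose state tensors live in shared memory (illustrative sketch).
    def __init__(self, params, lr=1e-2, alpha=0.99, eps=1e-8,
                 weight_decay=0, momentum=0, centered=False):
        super().__init__(params, lr=lr, alpha=alpha, eps=eps,
                         weight_decay=weight_decay, momentum=momentum,
                         centered=centered)
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = torch.zeros(1)
                state['square_avg'] = torch.zeros_like(p.data).share_memory_()
                if momentum > 0:
                    state['momentum_buffer'] = torch.zeros_like(p.data).share_memory_()
                if centered:
                    state['grad_avg'] = torch.zeros_like(p.data).share_memory_()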
Example 3
def coordinator(rank, args, share_model, exp_queues, model_params):
    assert len(exp_queues) == args.num_processes

    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # print(device)

    model = ActorCritic()
    model.train()
    # model.load_state_dict(share_model.state_dict())
    for i in range(args.num_processes):
        model_params[i].put(model.state_dict())

    # if args.cuda:
    # model = model.cuda()
    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-5)
    entropy_coef = args.entropy_coef

    count = 0
    while True:
        count += 1
        if count >= 14000:
            entropy_coef = 1
        if count >= 17000:
            entropy_coef = 0.5
        if count >= 19000:
            entropy_coef = 0.1

        # assemble experiences from the agents
        for i in range(args.num_processes):
            s_batch, a_batch, r_batch, done = exp_queues[i].get()
            loss = compute_loss(args, s_batch, a_batch, r_batch, done, model,
                                entropy_coef)
            optimizer.zero_grad()
            loss.backward(retain_graph=True)
            if torch.isnan(loss):
                torch.save(s_batch, 's_batch-coor.pt')
                torch.save(loss, 'loss.pt')
                print('s_batch', s_batch)
                print('loss: ', loss)
                break
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           args.max_grad_norm)
            # for param in model.parameters():
            # param.grad.data.clamp_(-1, 1)
            optimizer.step()
        print('update model parameters ', count)
        if torch.isnan(loss):
            break
        # model.zero_grad()
        # if args.cuda:
        # model = model.cpu()
        for i in range(args.num_processes):
            model_params[i].put(model.state_dict())
        share_model.load_state_dict(model.state_dict())
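The coordinator above assumes worker processes that push rollouts onto exp_queues[i] and read refreshed weights from model_params[i]; the worker side is not shown. A minimal sketch of that assumed protocol (make_env, args.rollout_len and model.act are placeholders, not names from the original code):

def worker(rank, args, exp_queues, model_params, make_env):
    env = make_env()
    model = ActorCritic()
    while True:
        # Pull the latest weights published by the coordinator.
        model.load_state_dict(model_params[rank].get())
        s_batch, a_batch, r_batch = [], [], []
        state, done = env.reset(), False
        while not done and len(s_batch) < args.rollout_len:
            action = model.act(state)  # assumed sampling helper
            next_state, reward, done, _ = env.step(action)
            s_batch.append(state)
            a_batch.append(action)
            r_batch.append(reward)
            state = next_state
        # Hand the rollout to the coordinator for the gradient update.
        exp_queues[rank].put((s_batch, a_batch, r_batch, done))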
Example 4
def main():
    #    try:
    parse_cmd_args()

    sess = tf.Session()
    K.set_session(sess)
    db = Database()
    env = Environment(db, argus)

    actor_critic = ActorCritic(env, sess, learning_rate=argus['learning_rate'], train_min_size=argus['train_min_size'],
                               size_mem=argus['maxlen_mem'], size_predict_mem=argus['maxlen_predict_mem'])

    num_trials = argus['num_trial']  # ?
    # trial_len  = 500   # ?
    # ntp
    env.preheat()

    # First iteration
    cur_state = env._get_obs()  # np.array      (inner_metric + sql)
    cur_state = cur_state.reshape((1, env.state.shape[0]))
    # action = env.action_space.sample()
    action = env.fetch_action()  # np.array
    action_2 = action.reshape((1, env.action_space.shape[0]))  # for memory
    new_state, reward, done, _ = env.step(action, 0, 1)  # apply the action -> to steady state -> return the reward
    new_state = new_state.reshape((1, env.state.shape[0]))
    reward_np = np.array([reward])

    print("0-shape-")
    print(new_state.shape)
    actor_critic.remember(cur_state, action_2, reward_np, new_state, done)
    actor_critic.train()  # len<32, useless

    cur_state = new_state
    for i in range(num_trials):
        # env.render()
        cur_state = cur_state.reshape((1, env.state.shape[0]))
        action, isPredicted = actor_critic.act(cur_state)
        print(action)
        action_2 = action.reshape((1, env.action_space.shape[0]))  # for memory
        # action.tolist()                                          # to execute
        new_state, reward, done, _ = env.step(action, isPredicted, i + 1)
        new_state = new_state.reshape((1, env.state.shape[0]))

        reward_np = np.array([reward])
        print("%d-shape-" % i)
        print(new_state.shape)

        actor_critic.remember(cur_state, action_2, reward_np, new_state, done)
        actor_critic.train()

        cur_state = new_state
Example 5
def train():

    memory = []
    Transition = collections.namedtuple(
        "Transition",
        ["state", "action", "reward", "next_state", "next_action"])

    model = ActorCritic(flags.n_actions, flags.n_features, flags.lr_C,
                        flags.lr_A, flags.gamma, empty_goal_action)

    loss_his = []
    entropy_his = []
    reward_his = []

    for ii in range(flags.max_epoch):
        state = env.reset()
        init_state = state.copy()
        reward_all = 0
        done = False
        steps = 0
        loss = 0
        t_start = time.time()
        action = model.choose_action(state)

        while not done:
            next_state, reward, done, _ = env.step(action)
            next_action = model.choose_action(next_state)
            reward_all += reward
            steps += 1

            if len(memory) > flags.memory_size:
                memory.pop(0)
            memory.append(
                Transition(state, action, reward, next_state, next_action))

            state = next_state
            action = next_action

        if len(memory) > flags.batch_size:
            batch_transition = random.sample(memory, flags.batch_size)
            batch_state, batch_action, batch_reward, batch_next_state, batch_next_action = map(
                np.array, zip(*batch_transition))
            loss, _ = model.train(state=batch_state,
                                  action=batch_action,
                                  reward=batch_reward,
                                  state_=batch_next_state,
                                  action_=batch_next_action)
            entropy = model.compute_entropy(init_state)

        if loss != 0:
            loss_his.append(loss)
            entropy_his.append(entropy)
            reward_his.append(reward_all)
            print("epoch=", ii, "/time=",
                  time.time() - t_start, "/loss=", loss, "/entropy=", entropy,
                  "/reward=", reward_all)

    return loss_his, entropy_his, reward_his
Example 6
def train_curiosity(rank, args, shared_model, shared_curiosity, counter, lock,
                    pids, optimizer):
    pids.append(os.getpid())

    torch.manual_seed(args.seed + rank)

    if args.game == 'doom':
        env = create_doom_env(args.env_name,
                              rank,
                              num_skip=args.num_skip,
                              num_stack=args.num_stack)
    elif args.game == 'atari':
        env = create_atari_env(args.env_name)
    elif args.game == 'picolmaze':
        env = create_picolmaze_env(args.num_rooms)
    env.seed(args.seed + rank)

    model = ActorCritic(
        # env.observation_space.shape[0],
        args.num_stack,
        env.action_space)
    curiosity = IntrinsicCuriosityModule(  # ICM
        # env.observation_space.shape[0],
        args.num_stack,
        env.action_space)

    if optimizer is None:
        # optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)
        optimizer = optim.Adam(  # ICM
            chain(shared_model.parameters(), shared_curiosity.parameters()),
            lr=args.lr)

    model.train()
    curiosity.train()  # ICM

    model.load_state_dict(shared_model.state_dict())

    state = env.reset()
    state = torch.from_numpy(state)
    done = True

    episode_length = 0

    killer = Killer()
    while not killer.kill_now:
        # Sync with the shared model
        curiosity.load_state_dict(shared_curiosity.state_dict())  # ICM

        if done:
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        inv_loss = torch.tensor(0.0)  # ICM
        forw_loss = torch.tensor(0.0)  # ICM

        for step in range(args.num_steps):
            if done:
                episode_length = 0
                state = env.reset()
                state = torch.from_numpy(state)
            episode_length += 1

            value, logit, (hx, cx) = model(state.unsqueeze(0), hx, cx)
            prob = F.softmax(logit, dim=-1)

            action = prob.multinomial(num_samples=1).flatten().detach()

            state_old = state  # ICM

            state, external_reward, done, _ = env.step(action)
            state = torch.from_numpy(state)

            # external reward = 0 if ICM-only mode
            external_reward = external_reward * (1 - args.icm_only)

            # <---ICM---
            inv_out, forw_out, curiosity_reward = \
                curiosity(
                    state_old.unsqueeze(0), action,
                    state.unsqueeze(0))
            # In noreward-rl:
            # self.invloss = tf.reduce_mean(
            #     tf.nn.sparse_softmax_cross_entropy_with_logits(logits, aindex),
            #     name="invloss")
            # self.forwardloss = 0.5 * tf.reduce_mean(tf.square(tf.subtract(f, phi2)), name='forwardloss')
            # self.forwardloss = self.forwardloss * 288.0 # lenFeatures=288. Factored out to make hyperparams not depend on it.
            current_inv_loss = F.nll_loss(F.log_softmax(inv_out, dim=-1),
                                          action)
            current_forw_loss = curiosity_reward
            inv_loss += current_inv_loss
            forw_loss += current_forw_loss
            # ---ICM--->

            done = done or episode_length >= args.max_episode_length

            with lock:
                counter.value += 1

            if done:
                break

        # <---ICM---
        inv_loss = inv_loss / episode_length
        forw_loss = forw_loss * (32 * 3 * 3) * 0.5 / episode_length

        curiosity_loss = args.lambda_1 * (
            (1 - args.beta) * inv_loss + args.beta * forw_loss)
        # ---ICM--->

        optimizer.zero_grad()

        curiosity_loss.backward()  # ICM
        torch.nn.utils.clip_grad_norm_(curiosity.parameters(),
                                       args.max_grad_norm)

        ensure_shared_grads(curiosity, shared_curiosity)
        optimizer.step()

    env.close()
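Several of the examples in this collection call ensure_shared_grads without defining it. In the pytorch-a3c lineage they follow, it points the shared model's gradients at the local worker's gradients before optimizer.step(); a sketch of that canonical helper, assuming the same lineage:

def ensure_shared_grads(model, shared_model):
    # Point the shared parameters' gradients at this worker's gradients,
    # but only if the shared gradients have not been set yet.
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is not None:
            return
        shared_param._grad = param.grad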
Example 7
    env.preheat()

    # First iteration
    cur_state = env._get_obs()  # np.array      (inner_metric + sql)
    cur_state = cur_state.reshape((1, env.state.shape[0]))
    # action = env.action_space.sample()
    action = env.fetch_action()  # np.array
    action_2 = action.reshape((1, env.action_space.shape[0]))  # for memory
    new_state, reward, done, score, _ = env.step(action, 0, 1)  # apply the action -> to steady state -> return the reward
    new_state = new_state.reshape((1, env.state.shape[0]))
    reward_np = np.array([reward])

    print("0-shape")
    print(new_state.shape)
    actor_critic.remember(cur_state, action_2, reward_np, new_state, done)
    actor_critic.train()  # len<32, useless

    cur_state = new_state
    predicted_rewardList = []
    for epoch in range(num_trials):
        # env.render()
        cur_state = cur_state.reshape((1, env.state.shape[0]))
        action, isPredicted = actor_critic.act(cur_state)
        print(action)
        action_2 = action.reshape((1, env.action_space.shape[0]))  # for memory
        # action.tolist()                                          # to execute
        new_state, reward, done, score, _ = env.step(action, isPredicted, epoch + 1)
        new_state = new_state.reshape((1, env.state.shape[0]))
        if isPredicted == 1:
            predicted_rewardList.append([epoch, reward])
Example 8
def train(rank,
          args,
          shared_model,
          counter,
          lock,
          optimizer=None,
          DEBUG=False):
    if DEBUG:
        print('rank: {}'.format(rank))
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    model.train()
    if DEBUG:
        print('agent{:03d}: model created'.format(rank))

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)
    if DEBUG:
        print('agent{:03d}: optimizer created'.format(rank))

    state = env.reset()
    state = torch.from_numpy(state)
    done = True

    episode_length = 0
    while True:
        if DEBUG:
            print('agent{:03d}: while loop'.format(rank))
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = Variable(torch.zeros(1, 256))
            hx = Variable(torch.zeros(1, 256))
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            if DEBUG:
                print('agent{:03d}: for loop p1'.format(rank))

            episode_length += 1
            if DEBUG:
                print('agent{:03d}: for loop p1.1'.format(rank))
                print(state.unsqueeze(0).size())
            with lock:
                value, logit, (hx, cx) = model(
                    (Variable(state.unsqueeze(0)), (hx, cx)))
            if DEBUG:
                print('agent{:03d}: for loop p2'.format(rank))
            # prob = F.softmax(logit)
            prob = F.softmax(logit, dim=1)
            log_prob = F.log_softmax(logit, dim=1)
            if DEBUG:
                print('agent{:03d}: for loop p3'.format(rank))
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)
            if DEBUG:
                print('agent{:03d}: for loop p4'.format(rank))

            action = prob.multinomial(num_samples=1).data
            log_prob = log_prob.gather(1, Variable(action))
            if DEBUG:
                print('agent{:03d}: for loop p5'.format(rank))

            state, reward, done, _ = env.step(action.numpy())
            done = done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)
            if DEBUG:
                print('agent{:03d}: for loop p6'.format(rank))

            with lock:
                counter.value += 1
                if DEBUG:
                    print('agent{:03d}: counter plus {:09d}'.format(
                        rank, counter.value))

            if DEBUG:
                print('agent{:03d}: for loop p7'.format(rank))
            if done:
                episode_length = 0
                state = env.reset()

            state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * Variable(gae) - args.entropy_coef * entropies[i]

        optimizer.zero_grad()

        (policy_loss + args.value_loss_coef * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
Example 9
while True:

    if episode_length % steps == 0:
        model.low_lr(rate)

    if (episode_length % 1000 == 0) and (episode_length > 20000):
        if dataset == 'cifar':
            model.eval()
            map = test_util.test(Dtest, model, batch_size, bit_len)
            file = open(logpath, "a")
            file.write('#### map=' + str(map) + '\n')
            file.close()
        path = checkpoint_path + '/' + str(episode_length) + '.model'
        torch.save(model.state_dict(), path)

    model.train()

    if dataset == 'cifar':
        ori, pos, neg = traintest.get_batch_cifar_nus(batch_size)
    else:
        ori, pos, neg = traintest.get_batch_flk_nus(batch_size)

    ori = Variable(ori).cuda()
    pos = Variable(pos).cuda()
    neg = Variable(neg).cuda()

    hash_o = Variable(torch.zeros(batch_size, 1).cuda())
    hash_p = Variable(torch.zeros(batch_size, 1).cuda())
    hash_n = Variable(torch.zeros(batch_size, 1).cuda())
    probs_o = model(ori)
    probs_p = model(pos)
Example 10
def train(rank, args, shared_model, counter, lock, optimizer=None):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()
    avg_rew_win_size = 25
    avg_rew = 0
    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    start_time = time.time()
    avg_rew_cnt = 0
    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            episode_length += 1
            value, logit, (hx, cx) = model((state.unsqueeze(0), (hx, cx)))
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)

            state, reward, done, _ = env.step(action.numpy())
            done = done or episode_length >= args.max_episode_length
            reward_sum += reward
            reward = max(min(reward, 1), -1)
            # a quick hack to prevent the agent from getting stuck
            actions.append(action[0, 0])
            if actions.count(actions[0]) == actions.maxlen:
                done = True

            with lock:
                counter.value += 1

            if done:
                avg_rew = avg_rew + reward_sum
                if avg_rew_cnt % avg_rew_win_size == 0:
                    print(" avg. episode reward {}".format(avg_rew /
                                                           avg_rew_win_size))
                    avg_rew = 0
                print("Time {},  episode reward {}, episode length {}".format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, episode_length))
                episode_length = 0
                reward_sum = 0
                actions.clear()
                state = env.reset()
                avg_rew_cnt = avg_rew_cnt + 1

            state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((state.unsqueeze(0), (hx, cx)))
            R = value.detach()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1] - values[i]
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * gae.detach() - args.entropy_coef * entropies[i]

        optimizer.zero_grad()

        (policy_loss + args.value_loss_coef * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
Example 11
def train(rank, args, shared_model, counter, lock, optimizer=None):
    torch.manual_seed(args.seed + rank)

    backend, env_name = args.env_name.split(':')

    if backend == 'unity3d':
        os.chdir('/mnt/code/')
        env = create_unity3d_env(train_mode=False,\
         file_name=os.path.join(UNITYFOLDER, env_name), \
         worker_id=rank, seed=args.seed + rank, \
         docker_training=True)
    elif backend == 'gym':
        env = create_atari_env(env_name)
        env.seed(args.seed + rank)
    else:
        print(f' [!]: {backend} is not a valid backend')
        raise ValueError

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    state = env.reset()
    state = torch.from_numpy(state).float()
    done = True

    episode_length = 0
    while True:
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = Variable(torch.zeros(1, 256))
            hx = Variable(torch.zeros(1, 256))
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            episode_length += 1
            value, logit, (hx, cx) = model((Variable(state.unsqueeze(0)),
                                            (hx, cx)))
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).data
            log_prob = log_prob.gather(1, Variable(action))

            state, reward, done, _ = env.step(action.numpy())
            done = done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)

            with lock:
                counter.value += 1

            if done:
                episode_length = 0
                state = env.reset()

            state = torch.from_numpy(state).float()
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * Variable(gae) - args.entropy_coef * entropies[i]

        optimizer.zero_grad()

        (policy_loss + args.value_loss_coef * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
Example 12
def train(rank, args, shared_model, counter, lock, optimizer=None):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    state = env.reset()
    state = torch.from_numpy(state)
    done = True

    episode_length = 0
    while True:
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            episode_length += 1
            value, logit, (hx, cx) = model((state.unsqueeze(0), (hx, cx)))
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)

            state, reward, done, _ = env.step(action.numpy())
            done = done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)

            with lock:
                counter.value += 1

            if done:
                episode_length = 0
                state = env.reset()

            state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((state.unsqueeze(0), (hx, cx)))
            R = value.detach()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1] - values[i]
            gae = gae * args.gamma * args.gae_lambda + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * gae.detach() - args.entropy_coef * entropies[i]

        optimizer.zero_grad()

        (policy_loss + args.value_loss_coef * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
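Workers with this train(rank, args, shared_model, counter, lock, optimizer=None) signature are normally spawned from a small launcher that places the model and a step counter in shared memory. A minimal sketch in the pytorch-a3c style (argument parsing and the separate test/monitor process are omitted; args.num_processes is an assumed attribute):

import torch.multiprocessing as mp

def launch(args):
    env = create_atari_env(args.env_name)
    shared_model = ActorCritic(env.observation_space.shape[0], env.action_space)
    shared_model.share_memory()  # make the weights visible to all workers

    counter = mp.Value('i', 0)   # global step counter shared across processes
    lock = mp.Lock()

    processes = []
    for rank in range(args.num_processes):
        p = mp.Process(target=train,
                       args=(rank, args, shared_model, counter, lock))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()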
Example 13
def train(rank, params, shared_model, optimizer):
    torch.manual_seed(params.seed + rank)  # shifting the seed with rank to asynchronize each training agent

    # creating an optimized environment thanks to the create_atari_env function
    env = create_expansionai_env(params.env_name, video=True, params=params)
    env.seed(params.seed + rank)  # aligning the seed of the environment on the seed of the agent

    # creating the model from the ActorCritic class
    model = ActorCritic(env.observation_space.shape[0], env.action_space, params)
    model.train()

    state = env.reset()  # state is a numpy array of size 1*42*42, in black & white
    logger.debug("Current training state {}".format(state))
    state = torch.from_numpy(state)  # converting the numpy array into a torch tensor

    done = True  # when the game is done
    episode_length = 0  # initializing the length of an episode to 0
    while True:  # repeat
        state = state.float()
        # synchronizing with the shared model - the agent gets the shared model to do an exploration on num_steps
        model.load_state_dict(shared_model.state_dict())

        if done:  # if it is the first iteration of the while loop or if the game was just done, then:
            cx = Variable(torch.zeros(1, params.lstm_size))  # the cell states of the LSTM are reinitialized to zero
            hx = Variable(torch.zeros(1, params.lstm_size))  # the hidden states of the LSTM are reinitialized to zero
        else:  # else:
            cx = Variable(cx.data)  # we keep the old cell states, making sure they are in a torch variable
            hx = Variable(hx.data)  # we keep the old hidden states, making sure they are in a torch variable

        values = []  # initializing the list of values (V(S))
        log_probs = []  # initializing the list of log probabilities
        rewards = []  # initializing the list of rewards
        entropies = []  # initializing the list of entropies

        for step in range(params.num_steps):  # going through the num_steps exploration steps
            episode_length += 1  # incrementing the episode length by one
            # getting from the model the output V(S) of the critic, the output Q(S,A) of the actor, and the new hidden & cell states
            value, action_value, (hx, cx) = model((Variable(state.unsqueeze(0)), (hx, cx)))

            # generating a distribution of probabilities over actions with the softmax: prob(a) = exp(q(a)) / sum_b exp(q(b))
            prob = F.softmax(action_value, dim=1)
            # generating a distribution of log probabilities with the log softmax: log_prob(a) = log(prob(a))
            log_prob = F.log_softmax(action_value, dim=1)
            entropy = -(log_prob * prob).sum(1)  # H(p) = - sum_x p(x).log(p(x))
            entropies.append(entropy)  # storing the computed entropy

            # selecting an action by taking a random draw from the prob distribution
            action = prob.multinomial(num_samples=1).data
            log_prob = log_prob.gather(1, Variable(action))  # getting the log prob associated to this selected action

            # playing the selected action, reaching the new state, and getting the new reward
            action_to_take = action.numpy()
            state, reward, done, _ = env.step(action_to_take)
            # if the episode lasts too long (the agent is stucked), then it is done
            done = (done or episode_length >= params.max_episode_length)
            reward = max(min(reward, 1), -1)  # clamping the reward between -1 and +1

            logger.debug(
                "Train action {} brought reward {} should we done {} after step {} in episode {} with state \n{}".format(
                    action_to_take,
                    reward, done,
                    step,
                    episode_length, state[1]))

            if episode_length % 100 == 0:
                logger.info(
                    "Train episode {} and current rewards {} with armies {} occupied cells {} and movable cells {}".format(
                        episode_length,
                        rewards, env.unwrapped.armies, env.unwrapped.occupied_cells_num,
                        env.unwrapped.movable_cells_num
                    ))

            if done:  # if the episode is done:
                logger.info(
                    "Train episode reward {}, episode length {} steps {}".format(reward, episode_length,
                                                                                 step))
                episode_length = 0  # resetting the episode length counter
                prev_state = state
                state = env.reset()  # restarting the environment

            state = torch.from_numpy(state).float()  # tensorizing the new state
            values.append(value)  # storing the value V(S) of the state
            log_probs.append(log_prob)  # storing the log prob of the action
            rewards.append(reward)  # storing the new observed reward

            if done:  # if we are done
                # we stop the exploration and we directly move on to the next step: the update of the shared model
                break

        R = torch.zeros(1, 1)  # initializing the cumulative reward
        if not done:  # if we are not done:
            # we initialize the cumulative reward with the value of the last shared state
            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
            R = value.data  # we initialize the cumulative reward with the value of the last shared state

        values.append(Variable(R))  # storing the value V(S) of the last reached state S
        policy_loss = 0  # initializing the policy loss
        value_loss = 0  # initializing the value loss
        R = Variable(R)  # making sure the cumulative reward R is a torch Variable
        gae = torch.zeros(1, 1)  # initializing the Generalized Advantage Estimation to 0

        for i in reversed(range(len(rewards))):  # starting from the last exploration step and going back in time
            # R = gamma*R + r_t = r_0 + gamma r_1 + gamma^2 * r_2 ... + gamma^(n-1)*r_(n-1) + gamma^nb_step * V(last_state)
            R = params.gamma * R + rewards[i]
            # R is an estimator of Q at time t = i so advantage_i = Q_i - V(state_i) = R - value[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)  # computing the value loss
            # computing the temporal difference
            delta_t = rewards[i] + params.gamma * values[i + 1].data - values[i].data
            # gae = sum_i (gamma*tau)^i * delta_t(i) with gae_i = gae_(i+1)*gamma*tau + (r_i + gamma*V(state_i+1) - V(state_i))
            gae = gae * params.gamma * params.tau + delta_t
            policy_loss = policy_loss - log_probs[i] * Variable(gae) - 0.01 * entropies[i]  # computing the policy loss

        optimizer.zero_grad()  # clearing the gradients before the update
        # the value loss is down-weighted by 0.5, so the policy loss carries twice the weight
        # print("= Train losses \npolicy_loss {} \n value_loss {}\n".format(policy_loss, value_loss))
        (policy_loss + 0.5 * value_loss).backward(retain_graph=True)
        # clipping the gradient norm to 40 to prevent the gradients from taking huge values and degenerating the algorithm
        torch.nn.utils.clip_grad_norm_(model.parameters(), 40)
        # making sure the model of the agent and the shared model share the same gradient
        ensure_shared_grads(model, shared_model)
        optimizer.step()  # running the optimization step
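The backward loop that the comments above walk through is n-step advantage actor-critic with Generalized Advantage Estimation. Written out (gamma is the discount, tau the GAE parameter, beta the entropy coefficient, 0.01 in this example):

\delta_i = r_i + \gamma V(s_{i+1}) - V(s_i)
\mathrm{gae}_i = \gamma \tau \, \mathrm{gae}_{i+1} + \delta_i = \sum_{l \ge 0} (\gamma\tau)^{l} \, \delta_{i+l}
R_i = r_i + \gamma R_{i+1}, \qquad R_n = V(s_n) \ \text{(or 0 at a terminal state)}
\mathcal{L}_{\text{value}} = \tfrac{1}{2} \sum_i \big(R_i - V(s_i)\big)^2, \qquad
\mathcal{L}_{\text{policy}} = -\sum_i \Big(\log \pi(a_i \mid s_i)\,\mathrm{gae}_i + \beta\, H\big(\pi(\cdot \mid s_i)\big)\Big)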
Example 14
def train(rank, args, T, shared_model, shared_average_model, optimiser):
    torch.manual_seed(args.seed + rank)

    env = gym.make(args.env)
    env.seed(args.seed + rank)
    action_size = env.action_space.n
    model = ActorCritic(env.observation_space, env.action_space,
                        args.hidden_size)
    model.train()

    if not args.on_policy:
        memory = EpisodicReplayMemory(args.memory_capacity,
                                      args.max_episode_length)

    t = 1  # Thread step counter
    done = True  # Start new episode

    while T.value() <= args.T_max:
        # On-policy episode loop
        while True:
            # Sync with shared model at least every t_max steps
            model.load_state_dict(shared_model.state_dict())
            # Get starting timestep
            t_start = t

            # Reset or pass on hidden state
            if done:
                hx, avg_hx = Variable(torch.zeros(1,
                                                  args.hidden_size)), Variable(
                                                      torch.zeros(
                                                          1, args.hidden_size))
                cx, avg_cx = Variable(torch.zeros(1,
                                                  args.hidden_size)), Variable(
                                                      torch.zeros(
                                                          1, args.hidden_size))
                # Reset environment and done flag
                state = state_to_tensor(env.reset())
                action, reward, done, episode_length = 0, 0, False, 0
            else:
                # Perform truncated backpropagation-through-time (allows freeing buffers after backwards call)
                hx = hx.detach()
                cx = cx.detach()

            # Lists of outputs for training
            policies, Qs, Vs, actions, rewards, average_policies = [], [], [], [], [], []

            while not done and t - t_start < args.t_max:
                # Calculate policy and values
                input = extend_input(state,
                                     action_to_one_hot(action, action_size),
                                     reward)
                policy, Q, V, (hx, cx) = model(Variable(input), (hx, cx))
                average_policy, _, _, (avg_hx, avg_cx) = shared_average_model(
                    Variable(input), (avg_hx, avg_cx))

                # Sample action
                action = policy.multinomial(num_samples=1).data[
                    0,
                    0]  # Graph broken as loss for stochastic action calculated manually

                # Step
                next_state, reward, done, _ = env.step(action)
                next_state = state_to_tensor(next_state)
                reward = args.reward_clip and min(max(
                    reward, -1), 1) or reward  # Optionally clamp rewards
                done = done or episode_length >= args.max_episode_length  # Stop episodes at a max length
                episode_length += 1  # Increase episode counter

                if not args.on_policy:
                    # Save (beginning part of) transition for offline training
                    memory.append(input, action, reward,
                                  policy.data)  # Save just tensors
                # Save outputs for online training
                [
                    arr.append(el) for arr, el in zip((
                        policies, Qs, Vs, actions, rewards, average_policies
                    ), (policy, Q, V, Variable(torch.LongTensor([[action]])),
                        Variable(torch.Tensor([[reward]])), average_policy))
                ]

                # Increment counters
                t += 1
                T.increment()

                # Update state
                state = next_state

            # Break graph for last values calculated (used for targets, not directly as model outputs)
            if done:
                # Qret = 0 for terminal s
                Qret = Variable(torch.zeros(1, 1))

                if not args.on_policy:
                    # Save terminal state for offline training
                    memory.append(
                        extend_input(state,
                                     action_to_one_hot(action, action_size),
                                     reward), None, None, None)
            else:
                # Qret = V(s_i; θ) for non-terminal s
                _, _, Qret, _ = model(Variable(input), (hx, cx))
                Qret = Qret.detach()

            # Train the network on-policy
            _train(args, T, model, shared_model, shared_average_model,
                   optimiser, policies, Qs, Vs, actions, rewards, Qret,
                   average_policies)

            # Finish on-policy episode
            if done:
                break

        # Train the network off-policy when enough experience has been collected
        if not args.on_policy and len(memory) >= args.replay_start:
            # Sample a number of off-policy episodes based on the replay ratio
            for _ in range(_poisson(args.replay_ratio)):
                # Act and train off-policy for a batch of (truncated) episode
                trajectories = memory.sample_batch(args.batch_size,
                                                   maxlen=args.t_max)

                # Reset hidden state
                hx, avg_hx = Variable(
                    torch.zeros(args.batch_size, args.hidden_size)), Variable(
                        torch.zeros(args.batch_size, args.hidden_size))
                cx, avg_cx = Variable(
                    torch.zeros(args.batch_size, args.hidden_size)), Variable(
                        torch.zeros(args.batch_size, args.hidden_size))

                # Lists of outputs for training
                policies, Qs, Vs, actions, rewards, old_policies, average_policies = [], [], [], [], [], [], []

                # Loop over trajectories (bar last timestep)
                for i in range(len(trajectories) - 1):
                    # Unpack first half of transition
                    input = torch.cat([trajectory.state
                                       for trajectory in trajectories[i]], 0)
                    action = Variable(
                        torch.LongTensor([
                            trajectory.action for trajectory in trajectories[i]
                        ])).unsqueeze(1)
                    reward = Variable(
                        torch.Tensor([
                            trajectory.reward for trajectory in trajectories[i]
                        ])).unsqueeze(1)
                    old_policy = Variable(
                        torch.cat([trajectory.policy
                                   for trajectory in trajectories[i]], 0))

                    # Calculate policy and values
                    policy, Q, V, (hx, cx) = model(Variable(input), (hx, cx))
                    average_policy, _, _, (avg_hx,
                                           avg_cx) = shared_average_model(
                                               Variable(input),
                                               (avg_hx, avg_cx))

                    # Save outputs for offline training
                    [
                        arr.append(el)
                        for arr, el in zip((policies, Qs, Vs, actions, rewards,
                                            average_policies, old_policies), (
                                                policy, Q, V, action, reward,
                                                average_policy, old_policy))
                    ]

                    # Unpack second half of transition
                    next_input = torch.cat(
                        [trajectory.state
                         for trajectory in trajectories[i + 1]], 0)
                    done = Variable(
                        torch.Tensor([
                            trajectory.action is None
                            for trajectory in trajectories[i + 1]
                        ]).unsqueeze(1))

                # Do forward pass for all transitions
                _, _, Qret, _ = model(Variable(next_input), (hx, cx))
                # Qret = 0 for terminal s, V(s_i; θ) otherwise
                Qret = ((1 - done) * Qret).detach()

                # Train the network off-policy
                _train(args,
                       T,
                       model,
                       shared_model,
                       shared_average_model,
                       optimiser,
                       policies,
                       Qs,
                       Vs,
                       actions,
                       rewards,
                       Qret,
                       average_policies,
                       old_policies=old_policies)
        done = True

    env.close()
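The off-policy branch above draws the number of replay updates from _poisson(args.replay_ratio), which is not shown in this excerpt. A small sketch using Knuth's algorithm, assuming that is roughly what the original helper does:

import math
import random

def _poisson(lmbd):
    # Draw one sample from a Poisson(lmbd) distribution (Knuth's method).
    L, k, p = math.exp(-lmbd), 0, 1.0
    while True:
        k += 1
        p *= random.uniform(0, 1)
        if p <= L:
            return k - 1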
Example 15
def main():
    # Choose the device for the network computations
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Build the network
    net = ActorCritic()
    net = net.to(device)

    # Set up the optimizer
    optimizer = torch.optim.Adam(net.parameters(), lr=3e-4)

    # Set up the environments
    envs = Envs(NUM_WORKERS, gamma=GAMMA)

    # Start training
    for episode in range(EPISODES):

        # Collect one episode of data from the parallel environments
        net.eval()
        with torch.no_grad():
            states = envs.reset()
            done = False
            while not done:
                states = states.to(device)
                _, policys = net(states)
                policys = policys.cpu()  # easier to post-process on the CPU
                # zero out the probabilities of positions where a move cannot be played
                for i in range(NUM_WORKERS):
                    if envs.reversis[i].next != 0:
                        for y, x in itertools.product(range(SIZE), repeat=2):
                            if not envs.reversis[i].good[y][x]:
                                policys[i][y * SIZE + x] = 0.
                            else:
                                policys[i][y * SIZE + x] += 1e-8  # avoid an all-zero distribution
                actions = Categorical(probs=policys).sample()
                done, states = envs.step(actions)

        envs.setReturn()
        data = EpisodeData(envs.readHistory())
        loader = DataLoader(data,
                            batch_size=BATCH_SIZE,
                            shuffle=True,
                            num_workers=2)

        # Train the network
        net.train()

        # Tracked metrics
        value_loss_total = 0.
        entropy_total = 0.

        for states, actions, Returns in loader:
            states, actions, Returns = states.to(device), actions.to(
                device), Returns.to(device)
            values, policys = net(states)

            dist = Categorical(probs=policys)
            action_log_probs = dist.log_prob(actions).view(-1, 1)
            dist_entropy = dist.entropy().mean()  # a larger entropy keeps the policy exploratory

            advantages = Returns.view(-1, 1) - values

            value_loss = advantages.pow(2).mean()
            action_loss = -(advantages.detach() * action_log_probs).mean()

            optimizer.zero_grad()
            (VALUE_LOSS_COEF * value_loss + action_loss -
             ENTROPY_LOSS_COEF * dist_entropy).backward()
            optimizer.step()

            value_loss_total += value_loss.item()
            entropy_total += dist_entropy.item()

        print('Episode: {:>10d}, Value Loss: {:g}, Entropy: {:g}'.format(
            episode, value_loss_total / len(loader),
            entropy_total / len(loader)),
              flush=True)

        if episode != 0 and episode % SAVE_INTERVAL == 0:
            if not os.path.isdir('models'):
                os.mkdir('models')
            torch.save(net.state_dict(),
                       'models/{}.pt'.format(episode // SAVE_INTERVAL))
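EpisodeData wraps the history collected by the environments so it can be served through a DataLoader; its definition is not part of this excerpt. A minimal sketch, assuming envs.readHistory() returns parallel sequences of states, actions and returns:

from torch.utils.data import Dataset

class EpisodeData(Dataset):
    # Hypothetical wrapper exposing (state, action, Return) triples.
    def __init__(self, history):
        self.states, self.actions, self.returns = history

    def __len__(self):
        return len(self.states)

    def __getitem__(self, idx):
        return self.states[idx], self.actions[idx], self.returns[idx]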
Example 16
def train(rank, args, shared_model, counter, lock, logger, optimizer=None):
    if args.save_sigmas:
        sigmas_f = logger.init_one_sigmas_file(rank)

    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    if args.add_rank_reg:
        if args.rank_reg_type == "maxdividemin":
            rank_reg = MaxDivideMin.apply
        elif args.rank_reg_type == "maxminusmin":
            rank_reg = MaxMinusMin.apply

    model.train()

    state = env.reset()
    state = torch.from_numpy(state)
    done = True

    local_counter = 0
    episode_length = 0
    while True:
        if args.max_counter_num != 0 and counter.value > args.max_counter_num:
            exit(0)
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = Variable(torch.zeros(1, 256))
            hx = Variable(torch.zeros(1, 256))
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)

        values = []
        log_probs = []
        rewards = []
        entropies = []
        if args.add_rank_reg:
            hiddens = [None] * 2  # 0: last layer, 1: second-to-last layer

        for step in range(args.num_steps):
            episode_length += 1
            model_inputs = (Variable(state.unsqueeze(0)), (hx, cx))
            if args.add_rank_reg:
                value, logit, (hx, cx), internal_features = model(model_inputs, return_features=True)
            else:
                value, logit, (hx, cx) = model(model_inputs)

            prob = F.softmax(logit, dim=1)
            log_prob = F.log_softmax(logit, dim=1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)
            if args.add_rank_reg:
                if hiddens[0] is None:
                    hiddens[0] = internal_features[-1]
                    hiddens[1] = internal_features[-2]
                else:
                    hiddens[0] = torch.cat([hiddens[0], internal_features[-1]])
                    hiddens[1] = torch.cat([hiddens[1], internal_features[-2]])

            action = prob.multinomial(num_samples=1).data
            log_prob = log_prob.gather(1, Variable(action))

            state, reward, done, _ = env.step(action.numpy())
            done = done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)

            local_counter += 1
            with lock:
                if local_counter % 20 == 0:
                    counter.value += 20

            if done:
                episode_length = 0
                state = env.reset()

            state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = args.gamma * values[i + 1].data - values[i].data + rewards[i]
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * Variable(gae) - args.entropy_coef * entropies[i]

        total_loss = policy_loss + args.value_loss_coef * value_loss

        # internal layers regularizer
        retain_graph = None
        if args.add_rank_reg:
            current_rankreg_coef = args.rank_reg_coef
            # total_loss = total_loss + rank_reg(hiddens[0], args.rank_reg_coef)
            if args.save_sigmas and local_counter % args.save_sigmas_every <= 3:
                norm = rank_reg(hiddens[0], current_rankreg_coef, counter.value, sigmas_f, logger)
            else:
                norm = rank_reg(hiddens[0], current_rankreg_coef)
            total_loss = total_loss + norm

        optimizer.zero_grad()

        total_loss.backward(retain_graph=retain_graph)
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
Example 17
def train(rank,
          args,
          shared_model,
          counter,
          lock,
          optimizer=None,
          select_sample=True):
    FloatTensor = torch.cuda.FloatTensor if args.use_cuda else torch.FloatTensor
    LongTensor = torch.cuda.LongTensor if args.use_cuda else torch.LongTensor

    env = setup_env(args.env_name)

    model = ActorCritic(1, env.action_space.n)

    if args.use_cuda:
        model.cuda()

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    state = prepro(env.reset())
    state = torch.from_numpy(state)

    done = True
    episode_length = 0
    for num_iter in count():

        if rank == 0:

            if num_iter % args.save_interval == 0 and num_iter > 0:
                #print ("Saving model at :" + args.save_path)
                torch.save(shared_model.state_dict(), args.save_path)

        if num_iter % (
                args.save_interval * 2.5
        ) == 0 and num_iter > 0 and rank == 1:  # Second saver in-case first processes crashes
            #print ("Saving model for process 1 at :" + args.save_path)
            torch.save(shared_model.state_dict(), args.save_path)

        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = Variable(torch.zeros(1, 512)).type(FloatTensor)
            hx = Variable(torch.zeros(1, 512)).type(FloatTensor)
        else:
            cx = Variable(cx.data).type(FloatTensor)
            hx = Variable(hx.data).type(FloatTensor)

        values, log_probs, rewards, entropies = [], [], [], []
        actions, forwards, vec_st1s, inverses = [], [], [], []

        for step in range(args.num_steps):
            episode_length += 1
            state_inp = Variable(state.unsqueeze(0)).type(FloatTensor)
            value, logit, (hx, cx) = model((state_inp, (hx, cx)), False)
            s_t = state
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(-1, keepdim=True)
            entropies.append(entropy)

            if select_sample:
                action = prob.multinomial(num_samples=1).data
            else:
                action = prob.max(-1, keepdim=True)[1].data
            log_prob = log_prob.gather(-1, Variable(action))

            action_out = action.to(torch.device("cpu"))

            oh_action = torch.Tensor(1, env.action_space.n).type(LongTensor)
            oh_action.zero_()
            oh_action.scatter_(1, action, 1)
            a_t = oh_action.type(FloatTensor)
            #print ('action', a_t)

            state, reward, done, _ = env.step(action_out.numpy()[0][0])
            done = done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)
            #print ('extrinsic reward', reward)

            state = torch.from_numpy(prepro(state))
            s_t1 = state

            vec_st1, inverse, forward = model(
                (Variable(s_t.unsqueeze(0)).type(FloatTensor),
                 Variable(s_t1.unsqueeze(0)).type(FloatTensor), a_t), True)
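            # intrinsic curiosity reward: half the squared error between the
            # forward model's predicted next-state features and the actual
            # next-state features, scaled by eta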
            reward_intrinsic = args.eta * (
                (vec_st1 - forward).pow(2)).sum(1) / 2.
            reward_intrinsic = reward_intrinsic.to(torch.device("cpu"))
            #print('intrinsic reward', reward_intrinsic)

            reward += reward_intrinsic
            reward1 = reward_intrinsic
            #print('total_reward', reward)

            with lock:
                counter.value += 1

            if done:
                episode_length = 0
                state = torch.from_numpy(prepro(env.reset()))

            values.append(value)
            log_probs.append(log_prob)
            reward1 = reward1.type(FloatTensor)
            rewards.append(reward1)
            forwards.append(forward)
            vec_st1s.append(vec_st1)
            inverses.append(inverse)
            actions.append(a_t)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            state_inp = Variable(state.unsqueeze(0)).type(FloatTensor)
            value, _, _ = model((state_inp, (hx, cx)), False)
            R = value.data

        values.append(Variable(R).type(FloatTensor))
        policy_loss = 0
        value_loss = 0
        forward_loss = 0
        inverse_loss = 0
        R = Variable(R).type(FloatTensor)
        gae = torch.zeros(1, 1).type(FloatTensor)
        #print (rewards)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            delta_t = rewards[i] + args.gamma * \
                values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * Variable(gae).type(FloatTensor) - args.entropy_coef * entropies[i]

            forward_err = forwards[i] - vec_st1s[i]
            forward_loss = forward_loss + 0.5 * (forward_err.pow(2)).sum(1)

            cross_entropy = -(actions[i] *
                              torch.log(inverses[i] + 1e-15)).sum(1)
            inverse_loss = inverse_loss + cross_entropy

        #print ('forward loss', forward_loss)
        #print ('inverse loss', inverse_loss)
        #print ('other loss', (policy_loss + args.value_loss_coef * value_loss))
        optimizer.zero_grad()
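        # Backpropagate the curiosity (inverse + forward) losses first with
        # retain_graph=True so the graph survives the second backward call,
        # then backpropagate the scaled A3C policy/value loss.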

        ((1 - args.beta) * inverse_loss +
         args.beta * forward_loss).backward(retain_graph=True)
        (args.lmbda * (policy_loss + 0.5 * value_loss)).backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
Esempio n. 18
0
def train(rank, args, T, shared_model, optimiser):
    torch.manual_seed(args.seed + rank)

    env = gym.make(args.env)
    env.seed(args.seed + rank)
    action_size = env.action_space.n
    model = ActorCritic(env.observation_space, env.action_space,
                        args.hidden_size, args.no_noise, args.noise_entropy)
    model.train()

    t = 1  # Thread step counter
    done = True  # Start new episode

    while T.value() <= args.T_max:
        # Sync with shared model at least every t_max steps
        model.load_state_dict(shared_model.state_dict())
        # Get starting timestep
        t_start = t

        # Reset or pass on hidden state
        if done:
            hx = Variable(torch.zeros(1, args.hidden_size))
            cx = Variable(torch.zeros(1, args.hidden_size))
            # Reset environment and done flag
            state = state_to_tensor(env.reset())
            action, reward, done, episode_length = 0, 0, False, 0
        else:
            # Perform truncated backpropagation-through-time (allows freeing buffers after backwards call)
            hx = hx.detach()
            cx = cx.detach()
        model.sample_noise(
        )  # Pick a new noise vector (until next optimisation step)

        # Lists of outputs for training
        values, log_probs, rewards, entropies = [], [], [], []

        while not done and t - t_start < args.t_max:
            input = extend_input(state, action_to_one_hot(action, action_size),
                                 reward, episode_length)
            # Calculate policy and value
            policy, value, (hx, cx) = model(Variable(input), (hx, cx))
            log_policy = policy.log()
            entropy = -(log_policy * policy).sum(1)

            # Sample action
            action = policy.multinomial()
            log_prob = log_policy.gather(
                1, action.detach()
            )  # Graph broken as loss for stochastic action calculated manually
            action = action.data[0, 0]

            # Step
            state, reward, done, _ = env.step(action)
            state = state_to_tensor(state)
            reward = args.reward_clip and min(max(
                reward, -1), 1) or reward  # Optionally clamp rewards
            done = done or episode_length >= args.max_episode_length
            episode_length += 1  # Increase episode counter

            # Save outputs for training
            [
                arr.append(el)
                for arr, el in zip((values, log_probs, rewards,
                                    entropies), (value, log_prob, reward,
                                                 entropy))
            ]

            # Increment counters
            t += 1
            T.increment()

        # Return R = 0 for terminal s or V(s_i; θ) for non-terminal s
        if done:
            R = Variable(torch.zeros(1, 1))
        else:
            _, R, _ = model(Variable(input), (hx, cx))
            R = R.detach()
        values.append(R)

        # Train the network
        policy_loss = 0
        value_loss = 0
        A_GAE = torch.zeros(1, 1)  # Generalised advantage estimator Ψ
        # Calculate n-step returns in forward view, stepping backwards from the last state
        trajectory_length = len(rewards)
        for i in reversed(range(trajectory_length)):
            # R ← r_i + γR
            R = rewards[i] + args.discount * R
            # Advantage A = R - V(s_i; θ)
            A = R - values[i]
            # dθ ← dθ - ∂A^2/∂θ
            value_loss += 0.5 * A**2  # Least squares error

            # TD residual δ = r + γV(s_i+1; θ) - V(s_i; θ)
            td_error = rewards[i] + args.discount * values[
                i + 1].data - values[i].data
            # Generalised advantage estimator Ψ (roughly of form ∑(γλ)^t∙δ)
            A_GAE = A_GAE * args.discount * args.trace_decay + td_error
            # dθ ← dθ + ∇θ∙log(π(a_i|s_i; θ))∙Ψ
            policy_loss -= log_probs[i] * Variable(
                A_GAE)  # Policy gradient loss
            if args.no_noise or args.noise_entropy:
                # dθ ← dθ + β∙∇θH(π(s_i; θ))
                policy_loss -= args.entropy_weight * entropies[
                    i]  # Entropy maximisation loss

        # Optionally normalise loss by number of time steps
        if not args.no_time_normalisation:
            policy_loss /= trajectory_length
            value_loss /= trajectory_length

        # Zero shared and local grads
        optimiser.zero_grad()
        # Note that losses were defined as negatives of normal update rules for gradient descent
        (policy_loss + value_loss).backward()
        # Gradient L2 normalisation
        nn.utils.clip_grad_norm(model.parameters(), args.max_gradient_norm, 2)

        # Transfer gradients to shared model and update
        _transfer_grads_to_shared_model(model, shared_model)
        optimiser.step()
        if not args.no_lr_decay:
            # Linearly decay learning rate
            _adjust_learning_rate(
                optimiser,
                max(args.lr * (args.T_max - T.value()) / args.T_max, 1e-32))

    env.close()
Esempio n. 19
0
def train(pid, rank, args, shared_model, counter, lock, optimizer=None):
    torch.manual_seed(args.seed + rank)
    env = create_atari_env(args.env_name)
    filepath = "./train_model_" + str(rank)
    env = gym.wrappers.Monitor(env, filepath, force=True)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space.n)

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)
    model.train()
    obs = env.reset()
    state = torch.from_numpy(obs)
    # state = get_state(obs)
    done = True
    while True:
        # if the parent process is killed by "kill -9", the child process kills itself
        pps = psutil.Process(pid=pid)
        try:
            if pps.status() in (psutil.STATUS_DEAD, psutil.STATUS_STOPPED):
                break
        except psutil.NoSuchProcess:
            break
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        values = []
        log_probs = []
        rewards = []
        entropies = []
        if done:
            cx = torch.zeros(1, 512)
            hx = torch.zeros(1, 512)
        else:
            cx = cx.detach()
            hx = hx.detach()

        for step in range(args.num_steps):
            value, logit, (hx, cx) = model((state.unsqueeze(0), (hx, cx)))
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            # sampled from the multinomial probability distribution
            action = prob.multinomial(num_samples=1).detach()  # [[1]]
            log_prob = log_prob.gather(1, action)
            obs, reward, done, _ = env.step(action.numpy())
            # done = (done or episode_length >= params.max_episode_length) # if the episode lasts too long (the agent is stuck), then it is done
            # reward = max(min(reward, 1), -1)  # clamping the reward between -1 and +1
            with lock:
                counter.value += 1
            if done:
                obs = env.reset()
            state = torch.from_numpy(obs)
            entropies.append(entropy)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)
            if done:
                # print("step {} done {}".format(step, done))
                break

        # Gradient = ∇θ′ log π(a_t|s_t; θ′) (R_t − V(s_t; θ_v)) + β ∇θ′ H(π(s_t; θ′))
        # gae-lambda - 1.00
        # entropy-coef - 0.01
        # value-loss-coef - 0.5
        # max-grad-norm - 40
        # gamma - 0.99
        R = torch.zeros(1, 1)  # if done R=[[0]]

        if not done:
            value, _, _ = model((state.unsqueeze(0), (hx, cx)))
            R = value.detach()
        values.append(R)
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)  # Generalized Advantage Estimation
        for i in reversed(range(len(rewards))):
            # advantage = Q - V
            R = rewards[i] + args.gamma * R  # n-step
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)
            # Generalized Advantage Estimation
            td_error = rewards[i] + args.gamma * values[i + 1] - values[i]
            gae = gae * args.gamma * args.gae_lambda + td_error
            policy_loss = policy_loss - log_probs[i] * gae.detach(
            ) - args.entropy_coef * entropies[i]

        optimizer.zero_grad()
        # if this does not work, switch to PyTorch 1.4.0
        # value_loss is scaled by value_loss_coef (0.5), so the policy loss gets
        # twice the relative weight in the combined objective
        (policy_loss + args.value_loss_coef * value_loss).backward()
        # rescale the gradients so their global norm is at most max_grad_norm (40),
        # preventing huge updates from destabilising training
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
Esempio n. 20
0
def train(rank, args, share_model, counter, lock):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    optimizer = optim.Adam(share_model.parameters(), lr=args.lr)
    model.train()

    state = env.reset()
    state = torch.FloatTensor(state)
    done = True
    # reward_sum = 0
    episode_length = 0
    while True:
        model.load_state_dict(share_model.state_dict())
        if done:
            cx = Variable(torch.zeros(1, 256))
            hx = Variable(torch.zeros(1, 256))
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            episode_length += 1
            value, logit, (hx, cx) = model((Variable(state.unsqueeze(0)), (hx, cx)))
            prob = F.softmax(logit)
            log_prob = F.log_softmax(logit)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial().data
            log_prob = log_prob.gather(1, Variable(action))

            state, reward, done, _ = env.step(action.numpy())
            # print('reward', reward)
            done = done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)
            # reward_sum += reward
            # print(reward)

            with lock:
                counter.value += 1

            if done:
                episode_length = 0
                state = env.reset()

            state = torch.FloatTensor(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                # print('rank: ', rank)
                # print('reward: ', reward_sum)
                # reward_sum = 0
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
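        # Walk the rollout backwards: accumulate the n-step return R for the
        # value loss and the GAE advantage (gamma * tau discounted sum of TD
        # residuals) for the policy loss.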
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            delta_t = rewards[i] + args.gamma * values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - log_probs[i] * Variable(gae) - args.entropy_coef * entropies[i]

        optimizer.zero_grad()
        (policy_loss + args.value_loss_coef * value_loss).backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), args.max_grad_norm)
        ensure_shared_grads(model, share_model)
        optimizer.step()
Esempio n. 21
0
def train(rank, args, T, shared_model, optimiser):
	torch.manual_seed(args.seed + rank)

	env = gym.make(args.env)
	env.seed(args.seed + rank)
	model = ActorCritic(env.observation_space, env.action_space, args.hidden_size)
	model.train()

	t = 1  # Thread step counter
	epr, eploss, done  = 0, 0, True # Start new episode

	while T.value() <= args.T_max:
		while True:
			model.load_state_dict(shared_model.state_dict()) # sync with shared model
			# Get starting timestep
			t_start = t

			policies, Vs, actions, rewards = [], [], [], [] # save values for computing gradients

			# Reset or pass on hidden state
			if done:
				hx, avg_hx = Variable(torch.zeros(1, args.hidden_size)), Variable(torch.zeros(1, args.hidden_size))
				cx, avg_cx = Variable(torch.zeros(1, args.hidden_size)), Variable(torch.zeros(1, args.hidden_size))
				# Reset environment and done flag
				state = state_to_tensor(env.reset())
				done, episode_length = False, 0
			else:
				# Perform truncated backpropagation-through-time (allows freeing buffers after backwards call)
				hx = hx.detach()
				cx = cx.detach()

			while not done and t - t_start < args.t_max:
				# Calculate policy and values
				policy, V, (hx, cx) = model(Variable(state), (hx, cx))

				# Sample action
				action = policy.multinomial().data[0, 0]

				# Step
				next_state, reward, done, _ = env.step(action)
				next_state = state_to_tensor(next_state)
				reward = args.reward_clip and min(max(reward, -1), 1) or reward  # Optionally clamp rewards
				done = done or episode_length >= args.max_episode_length  # Stop episodes at a max length
				episode_length += 1  # Increase episode counter
								
				# Save outputs for online training
				[arr.append(el) for arr, el in zip((policies, Vs, actions, rewards),
									 (policy, V, Variable(torch.LongTensor([[action]])), Variable(torch.Tensor([[reward]]))))]

				# Increment counters
				t += 1
				T.increment()

				# Update state
				state = next_state

			if done:
				R = Variable(torch.zeros(1, 1))
			else:
				# R = V(s_i; θ) for non-terminal s
				_, R, _ = model(Variable(state), (hx, cx))
				R = R.detach()

			# Train the network on-policy
			p_loss, v_loss = _train(args, T, model, shared_model, optimiser, policies, Vs, actions, rewards, R)

			# Finish episode
			if done:
				break
Esempio n. 22
0
def train(rank, args, shared_model, optimizer=None):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    state = env.reset()
    state = torch.from_numpy(state)
    done = True

    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model every iteration
        model.load_state_dict(shared_model.state_dict())
        if done:
            # initialization
            cx = Variable(torch.zeros(1, 128))
            hx = Variable(torch.zeros(1, 128))
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            # for mujoco, env returns DoubleTensor
            value, mu, sigma_sq, (hx, cx) = model(
                (Variable(state.float().unsqueeze(0).float()), (hx, cx)))
            sigma_sq = F.softplus(sigma_sq)
            eps = torch.randn(mu.size())
            # calculate the probability
            action = (mu + sigma_sq.sqrt() * Variable(eps)).data
            prob = normal(action, mu, sigma_sq)
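            # `normal` (defined elsewhere in this example) is assumed to return
            # the Gaussian density of the sampled action; its log is used as the
            # action log-probability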
            entropy = -0.5 * (
                (sigma_sq + 2 * pi.expand_as(sigma_sq)).log() + 1)

            entropies.append(entropy)
            log_prob = prob.log()

            state, reward, done, _ = env.step(action.numpy())
            # prevent stuck agents
            done = done or episode_length >= args.max_episode_length
            # reward shaping
            reward = max(min(reward, 1), -1)

            if done:
                episode_length = 0
                state = env.reset()

            state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _, _, _ = model(
                (Variable(state.float().unsqueeze(0)), (hx, cx)))
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        # calculate the rewards from the terminal state
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            # using .data detaches the tensors, so no gradient flows through the TD target
            delta_t = rewards[i] + args.gamma * \
                values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            # for MuJoCo, the entropy coefficient is lowered to 0.0001
            policy_loss = policy_loss - (log_probs[i]*Variable(gae).expand_as(log_probs[i])).sum() \
     - (0.0001*entropies[i]).sum()

        optimizer.zero_grad()

        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), 40)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
Esempio n. 23
0
def train(rank, args, shared_model, optimizer=None):
    torch.manual_seed(args.seed + rank)

    env = WrapEnv(args.env_name)
    model = ActorCritic(4, env.num_actions, args.num_skips)

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    state = env.reset()
    state = np.concatenate([state] * 4, axis=0)
    state = torch.from_numpy(state)
    done = True

    episode_length = 0
    sum_reward = 0
    for ep_counter in itertools.count(1):
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            value, logit = model(Variable(state.unsqueeze(0)))
            prob = F.softmax(logit)
            log_prob = F.log_softmax(logit)
            entropy = -(log_prob * prob).sum(1)
            entropies.append(entropy)

            action = prob.multinomial().data
            log_prob = log_prob.gather(1, Variable(action))

            action_np = action.numpy()[0][0]
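            # actions below n_real_acts are ordinary environment actions; the
            # remaining "skip" actions repeat NOOP for several frames and
            # accumulate the reward over the skipped frames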
            if action_np < model.n_real_acts:
                state_new, reward, done, info = env.step(action_np)
                state = np.append(state.numpy()[1:, :, :], state_new, axis=0)
                done = done or episode_length >= args.max_episode_length

                reward = max(min(reward, 1), -1)
                episode_length += 1
            else:
                state = state.numpy()
                reward = 0.
                for _ in range(action_np - model.n_real_acts + 2):
                    state_new, rew, done, info = env.step(
                        0)  # instead of random perform NOOP=0
                    state = np.append(state[1:, :, :], state_new, axis=0)
                    done = done or episode_length >= args.max_episode_length

                    reward += rew
                    episode_length += 1
                    if done:
                        break
                reward = max(min(reward, 1), -1)

            sum_reward += reward
            if done:
                state = env.reset()
                state = np.concatenate([state] * 4, axis=0)

                print('ep len {}, sum rew {}'.format(episode_length,
                                                     sum_reward))
                episode_length = 0
                sum_reward = 0

            state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _ = model(Variable(state.unsqueeze(0)))
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            policy_loss = policy_loss - \
                log_probs[i] * Variable(advantage.data) - 0.01 * entropies[i]

        optimizer.zero_grad()

        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), 40.)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
Esempio n. 24
0
def train(rank,
          args,
          shared_model,
          counter,
          num_done,
          num_episode,
          arr,
          lock,
          optimizer=None):
    torch.manual_seed(args.seed + rank)

    env = gym.make('MountainCar-v0').unwrapped

    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    state = env.reset()
    state = torch.from_numpy(state).float()
    done = True

    episode_length = 0
    reward = 0
    gae_lambda = args.gae_lambda1
    # while True:
    while counter.value < 120000000:
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = torch.zeros(1, 4).float()
            hx = torch.zeros(1, 4).float()
        else:
            cx = cx.detach()
            hx = hx.detach()

        values = []
        log_probs = []
        rewards = []
        entropies = []
        # reward = 0

        for step in range(args.num_steps):
            episode_length += 1

            with lock:
                num_episode.value += 1

            value, logit, (hx, cx) = model((state.unsqueeze(0), (hx, cx)))
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)

            state, reward_step, done, _ = env.step(action.numpy()[0, 0])
            #if step % 100 == 0:
            #print(action)
            #print(state)
            #print(prob)
            # print (reward_step)
            # print(" ")
            done = done or episode_length >= args.max_episode_length
            # reward = max(min(reward, 1), -1)
            reward += reward_step * 0.01
            # print(reward)

            with lock:
                counter.value += 1

            if done:
                #print("Done")
                episode_length = 0
                reward = 10000
                state = env.reset()
                with lock:
                    num_done.value += 1

            state = torch.from_numpy(state).float()
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward_step)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((state.unsqueeze(0), (hx, cx)))
            R = value.detach()
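        # two-stage GAE schedule: after ~47M environment steps, switch from
        # gae_lambda1 to gae_lambda2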
        if counter.value > 47000000:
            gae_lambda = args.gae_lambda2
            # print ("Stage2")
            # print (gae_lambda)
        values.append(R)
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1] - values[i]
            gae = gae * args.gamma * gae_lambda + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * gae.detach() - args.entropy_coef * entropies[i]

        optimizer.zero_grad()

        (policy_loss + args.value_loss_coef * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
Esempio n. 25
0
def train(
    rank, args, shared_model, shared_curiosity,
    counter, lock, pids, optimizer, train_policy_losses,
    train_value_losses, train_rewards
):
    pids.append(os.getpid())

    torch.manual_seed(args.seed + rank)

    if args.game == 'doom':
        env = create_doom_env(
            args.env_name, rank,
            num_skip=args.num_skip, num_stack=args.num_stack)
    elif args.game == 'atari':
        env = create_atari_env(args.env_name)
    elif args.game == 'picolmaze':
        env = create_picolmaze_env(args.num_rooms)
    env.seed(args.seed + rank)

    model = ActorCritic(
        # env.observation_space.shape[0],
        args.num_stack,
        env.action_space)
    curiosity = IntrinsicCuriosityModule(  # ICM
        # env.observation_space.shape[0],
        args.num_stack,
        env.action_space)

    if optimizer is None:
        # optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)
        optimizer = optim.Adam(  # ICM
            chain(shared_model.parameters(), shared_curiosity.parameters()),
            lr=args.lr)

    model.train()
    curiosity.train()  # ICM

    state = env.reset()
    state = torch.from_numpy(state)
    done = True

    episode_length = 0

    killer = Killer()
    while not killer.kill_now:
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        curiosity.load_state_dict(shared_curiosity.state_dict())  # ICM

        if done:
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        inv_loss = torch.tensor(0.0)   # ICM
        forw_loss = torch.tensor(0.0)  # ICM

        for step in range(args.num_steps):
            if done:
                episode_length = 0
                state = env.reset()
                state = torch.from_numpy(state)
            episode_length += 1

            value, logit, (hx, cx) = model(state.unsqueeze(0),
                                           hx, cx)
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)

            # Entropy trick
            if 'sparse' in args.env_name.lower():
                max_entropy = torch.log(
                    torch.tensor(logit.size()[1], dtype=torch.float))
                entropy = entropy \
                    if entropy <= args.max_entropy_coef * max_entropy \
                    else torch.tensor(0.0)

            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).flatten().detach()
            log_prob = log_prob.gather(1, action.view(1, 1))

            state_old = state  # ICM

            state, external_reward, done, _ = env.step(action)
            state = torch.from_numpy(state)

            # external reward = 0 if ICM-only mode
            external_reward = external_reward * (1 - args.icm_only)

            # <---ICM---
            inv_out, forw_out, curiosity_reward = \
                curiosity(
                    state_old.unsqueeze(0), action,
                    state.unsqueeze(0))
            # In noreward-rl:
            # self.invloss = tf.reduce_mean(
            #     tf.nn.sparse_softmax_cross_entropy_with_logits(logits, aindex),
            #     name="invloss")
            # self.forwardloss = 0.5 * tf.reduce_mean(tf.square(tf.subtract(f, phi2)), name='forwardloss')
            # self.forwardloss = self.forwardloss * 288.0 # lenFeatures=288. Factored out to make hyperparams not depend on it.
            current_inv_loss = F.nll_loss(F.log_softmax(inv_out, dim=-1), action)
            current_forw_loss = curiosity_reward
            inv_loss += current_inv_loss
            forw_loss += current_forw_loss

            curiosity_reward = args.eta * curiosity_reward

            reward = max(min(external_reward, args.clip), -args.clip) + \
                max(min(curiosity_reward.detach(), args.clip), -args.clip)
            # ---ICM--->

            done = done or episode_length >= args.max_episode_length

            with lock:
                counter.value += 1

            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        train_rewards[rank - 1] = sum(rewards)

        # <---ICM---
        inv_loss = inv_loss / episode_length
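        # scale the forward loss by the feature size (32 * 3 * 3 = 288), as in
        # noreward-rl, so the hyperparameters do not depend on it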
        forw_loss = forw_loss * (32 * 3 * 3) * 0.5 / episode_length

        curiosity_loss = args.lambda_1 * (
            (1 - args.beta) * inv_loss + args.beta * forw_loss)
        # ---ICM--->

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model(state.unsqueeze(0), hx, cx)
            R = value.detach()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1] - values[i]
            gae = gae * args.gamma * args.gae_lambda + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * gae.detach() - args.entropy_coef * entropies[i]

        optimizer.zero_grad()

        train_policy_losses[rank - 1] = float((policy_loss).detach().item())
        train_value_losses[rank - 1] = float((value_loss).detach().item())

        (policy_loss + args.value_loss_coef * value_loss +
            curiosity_loss).backward()  # ICM
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        torch.nn.utils.clip_grad_norm_(curiosity.parameters(), args.max_grad_norm)

        ensure_shared_grads(model, shared_model)
        ensure_shared_grads(curiosity, shared_curiosity)
        optimizer.step()

    env.close()
Esempio n. 26
0
def local_train(index, opt, global_model, optimizer, save=False):
    torch.manual_seed(123 + index)
    if save:
        start_time = timeit.default_timer()

    writer = SummaryWriter(opt.log_path)
    env, num_states, num_actions = create_train_env(opt.world, opt.stage,
                                                    opt.action_type)
    local_model = ActorCritic(num_states, num_actions)
    local_model.train()
    state = torch.from_numpy(env.reset())
    done = True
    curr_step = 0
    curr_episode = 0

    while True:
        if save:
            if curr_episode % opt.save_interval == 0 and curr_episode > 0:
                torch.save(
                    global_model.state_dict(),
                    f"{opt.saved_path}/a3c_super_mario_bros_{opt.world}_{opt.stage}"
                )
            print(f"Now Process {index}. Episode {curr_episode}")
        curr_episode += 1
        local_model.load_state_dict(global_model.state_dict())

        if done:
            h_0 = torch.zeros((1, 512), dtype=torch.float)
            c_0 = torch.zeros((1, 512), dtype=torch.float)
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()

        log_policies = []
        values = []
        rewards = []
        entropies = []

        for _ in range(opt.num_local_steps):
            curr_step += 1
            logits, value, h_0, c_0 = local_model(state, h_0, c_0)
            policy = F.softmax(logits, dim=1)
            log_policy = F.log_softmax(logits, dim=1)
            entropy = -(policy * log_policy).sum(1, keepdim=True)

            m = Categorical(policy)
            action = m.sample().item()

            state, reward, done, _ = env.step(action)
            state = torch.from_numpy(state)

            if curr_step > opt.num_global_steps:
                done = True

            if done:
                curr_step = 0
                state = torch.from_numpy(env.reset())

            values.append(value)
            log_policies.append(log_policy[0, action])
            rewards.append(reward)
            entropies.append(entropy)

            if done:
                break

        R = torch.zeros((1, 1), dtype=torch.float)

        if not done:
            _, R, _, _ = local_model(state, h_0, c_0)

        gae = torch.zeros((1, 1), dtype=torch.float)
        actor_loss = 0
        critic_loss = 0
        entropy_loss = 0
        next_value = R
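        # The reversed loop below walks the rollout from the last step to the
        # first, accumulating the GAE advantage for the actor, the discounted
        # return R for the critic, and the entropy bonus.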

        for value, log_policy, reward, entropy in list(
                zip(values, log_policies, rewards, entropies))[::-1]:
            gae = gae * opt.gamma * opt.tau
            gae = gae + reward + opt.gamma * next_value.detach(
            ) - value.detach()
            next_value = value
            actor_loss = actor_loss + log_policy * gae
            R = R * opt.gamma + reward
            critic_loss = critic_loss + (R - value)**2 / 2
            entropy_loss = entropy_loss + entropy

        total_loss = -actor_loss + critic_loss - opt.beta * entropy_loss
        writer.add_scalar(f"Train_{index}/Loss", total_loss, curr_episode)
        optimizer.zero_grad()
        total_loss.backward()
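        # The loop below copies the locally computed gradients into the global
        # model's parameters, stopping early if the global model already has
        # gradients assigned.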

        for local_param, global_param in zip(local_model.parameters(),
                                             global_model.parameters()):
            if global_param.grad is not None:
                break
            global_param._grad = local_param.grad

        optimizer.step()

        if curr_episode == int(opt.num_global_steps / opt.num_local_steps):
            print(f"Training process {index} terminated")
            if save:
                end_time = timeit.default_timer()
                print('The code runs for %.2f s ' % (end_time - start_time))
            return
Esempio n. 27
0
def train(rank, args, T, shared_model, shared_average_model, optimiser):
    torch.manual_seed(args.seed + rank)
    # CUDA
    if args.use_cuda:
        torch.cuda.manual_seed(args.seed + rank)

    env = gym.make(args.env)
    env.seed(args.seed + rank)
    model = ActorCritic(env.observation_space, env.action_space,
                        args.hidden_size)

    gpu_id = 0 if args.use_cuda else -1  # TODO: 0 refers to the first GPU
    if gpu_id >= 0:
        model = model.cuda()
    model.train()

    if not args.on_policy:
        # Normalise memory capacity by number of training processes
        memory = EpisodicReplayMemory(
            args.memory_capacity // args.num_processes,
            args.max_episode_length)

    t = 1  # Thread step counter
    done = True  # Start new episode

    while T.value() <= args.T_max:
        # On-policy episode loop
        while True:
            # Sync with shared model at least every t_max steps
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    model.load_state_dict(shared_model.state_dict())
            else:
                model.load_state_dict(shared_model.state_dict())
            # Get starting timestep
            t_start = t

            # Reset or pass on hidden state
            if done:
                avg_hx = torch.zeros(1, args.hidden_size)
                avg_cx = torch.zeros(1, args.hidden_size)
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        hx = torch.zeros(1, args.hidden_size).cuda()
                        cx = torch.zeros(1, args.hidden_size).cuda()
                else:
                    hx = torch.zeros(1, args.hidden_size)
                    cx = torch.zeros(1, args.hidden_size)

                # Reset environment and done flag
                state = state_to_tensor(env.reset())
                if gpu_id >= 0:
                    state = state.cuda()
                done, episode_length = False, 0
            else:
                # Perform truncated backpropagation-through-time (allows freeing buffers after backwards call)
                hx = hx.detach()
                cx = cx.detach()

            # Lists of outputs for training
            policies, Qs, Vs, actions, rewards, average_policies = [], [], [], [], [], []

            while not done and t - t_start < args.t_max:
                # Calculate policy and values
                policy, Q, V, (hx, cx) = model(state, (hx, cx))

                # the shared model lives on the CPU, so the state needs to be converted
                if gpu_id >= 0:
                    to_avg_state = state.cpu()
                else:
                    to_avg_state = state
                average_policy, _, _, (avg_hx, avg_cx) = shared_average_model(
                    to_avg_state, (avg_hx, avg_cx))
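                # the shared average policy network is queried here; its output is
                # presumably used for ACER's trust-region update inside _train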
                # if gpu_id >= 0:
                #     average_policies = average_policies.cuda()
                # Sample action
                action = torch.multinomial(policy, 1)[0, 0]

                # Step
                next_state, reward, done, _ = env.step(action.item())
                next_state = state_to_tensor(next_state)
                if gpu_id >= 0:
                    next_state = next_state.cuda()

                reward = args.reward_clip and min(max(
                    reward, -1), 1) or reward  # Optionally clamp rewards
                done = done or episode_length >= args.max_episode_length  # Stop episodes at a max length
                episode_length += 1  # Increase episode counter

                if not args.on_policy:
                    # Save (beginning part of) transition for offline training
                    memory.append(state, action, reward,
                                  policy.detach())  # Save just tensors
                # Save outputs for online training
                [
                    arr.append(el) for arr, el in zip((
                        policies, Qs, Vs, actions, rewards,
                        average_policies), (policy, Q, V,
                                            torch.LongTensor([[action]]),
                                            torch.Tensor([[reward]]),
                                            average_policy))
                ]

                # Increment counters
                t += 1
                T.increment()

                # Update state
                state = next_state

            # Break graph for last values calculated (used for targets, not directly as model outputs)
            if done:
                # Qret = 0 for terminal s
                Qret = torch.zeros(1, 1)

                if not args.on_policy:
                    # Save terminal state for offline training
                    memory.append(state, None, None, None)
            else:
                # Qret = V(s_i; θ) for non-terminal s
                _, _, Qret, _ = model(state, (hx, cx))
                Qret = Qret.detach().cpu()

            # Train the network on-policy
            if gpu_id >= 0:
                Qs = list(map(lambda x: x.cpu(), Qs))
                Vs = list(map(lambda x: x.cpu(), Vs))
                policies = list(map(lambda x: x.cpu(), policies))
            _train(args, T, model, shared_model, shared_average_model,
                   optimiser, policies, Qs, Vs, actions, rewards, Qret,
                   average_policies)

            # Finish on-policy episode
            if done:
                break

        # Train the network off-policy when enough experience has been collected
        if not args.on_policy and len(memory) >= args.replay_start:
            # Sample a number of off-policy episodes based on the replay ratio
            for _ in range(_poisson(args.replay_ratio)):
                # Act and train off-policy for a batch of (truncated) episode
                trajectories = memory.sample_batch(args.batch_size,
                                                   maxlen=args.t_max)

                # Reset hidden state
                avg_hx = torch.zeros(args.batch_size, args.hidden_size)
                avg_cx = torch.zeros(args.batch_size, args.hidden_size)
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        hx = torch.zeros(args.batch_size,
                                         args.hidden_size).cuda()
                        cx = torch.zeros(args.batch_size,
                                         args.hidden_size).cuda()
                else:

                    hx = torch.zeros(args.batch_size, args.hidden_size)
                    cx = torch.zeros(args.batch_size, args.hidden_size)

                # Lists of outputs for training
                policies, Qs, Vs, actions, rewards, old_policies, average_policies = [], [], [], [], [], [], []

                # Loop over trajectories (bar last timestep)
                for i in range(len(trajectories) - 1):
                    # Unpack first half of transition
                    state = torch.cat(
                        tuple(trajectory.state
                              for trajectory in trajectories[i]), 0)
                    action = torch.LongTensor([
                        trajectory.action for trajectory in trajectories[i]
                    ]).unsqueeze(1)
                    reward = torch.Tensor([
                        trajectory.reward for trajectory in trajectories[i]
                    ]).unsqueeze(1)
                    old_policy = torch.cat(
                        tuple(trajectory.policy
                              for trajectory in trajectories[i]), 0)

                    # Calculate policy and values
                    policy, Q, V, (hx, cx) = model(state, (hx, cx))
                    average_policy, _, _, (avg_hx,
                                           avg_cx) = shared_average_model(
                                               state, (avg_hx, avg_cx))

                    # Save outputs for offline training
                    [
                        arr.append(el)
                        for arr, el in zip((policies, Qs, Vs, actions, rewards,
                                            average_policies, old_policies), (
                                                policy, Q, V, action, reward,
                                                average_policy, old_policy))
                    ]

                    # Unpack second half of transition
                    next_state = torch.cat(
                        tuple(trajectory.state
                              for trajectory in trajectories[i + 1]), 0)
                    done = torch.Tensor([
                        trajectory.action is None
                        for trajectory in trajectories[i + 1]
                    ]).unsqueeze(1)

                # Do forward pass for all transitions
                _, _, Qret, _ = model(next_state, (hx, cx))
                # Qret = 0 for terminal s, V(s_i; θ) otherwise
                Qret = ((1 - done) * Qret).detach().cpu()

                # Train the network off-policy
                if gpu_id >= 0:
                    Qs = list(map(lambda x: x.cpu(), Qs))
                    Vs = list(map(lambda x: x.cpu(), Vs))
                    policies = list(map(lambda x: x.cpu(), policies))
                _train(args,
                       T,
                       model,
                       shared_model,
                       shared_average_model,
                       optimiser,
                       policies,
                       Qs,
                       Vs,
                       actions,
                       rewards,
                       Qret,
                       average_policies,
                       old_policies=old_policies)
        done = True

    env.close()
Esempio n. 28
0
class A3C():
    '''Implementation of N-step Asynchronous Advantage Actor Critic'''
    def __init__(self, args, env, train=True):
        self.args = args
        self.set_random_seeds()
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')

        # Create the environment.
        self.env = gym.make(env)
        self.environment_name = env

        # Setup model.
        self.policy = ActorCritic(4, self.env.action_space.n)
        self.policy.apply(self.initialize_weights)

        # Setup critic model.
        self.critic = ActorCritic(4, self.env.action_space.n)
        self.critic.apply(self.initialize_weights)

        # Setup optimizer.
        self.eps = 1e-10  # To avoid divide-by-zero error.
        self.policy_optimizer = optim.Adam(self.policy.parameters(),
                                           lr=args.policy_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=args.critic_lr)

        # Model weights path.
        self.timestamp = datetime.now().strftime(
            'a2c-breakout-%Y-%m-%d_%H-%M-%S')
        self.weights_path = 'models/%s/%s' % (self.environment_name,
                                              self.timestamp)

        # Load pretrained weights.
        if args.weights_path: self.load_model()
        self.policy.to(self.device)
        self.critic.to(self.device)

        # Video render mode.
        if args.render:
            self.policy.eval()
            self.generate_episode(render=True)
            self.plot()
            return

        # Data for plotting.
        self.rewards_data = []  # n * [epoch, mean(returns), std(returns)]

        # Network training mode.
        if train:
            # Tensorboard logging.
            self.logdir = 'logs/%s/%s' % (self.environment_name,
                                          self.timestamp)
            self.summary_writer = SummaryWriter(self.logdir)

            # Save hyperparameters.
            with open(self.logdir + '/training_parameters.json', 'w') as f:
                json.dump(vars(self.args), f, indent=4)

    def initialize_weights(self, layer):
        if isinstance(layer, nn.Linear) or isinstance(layer, nn.Conv2d):
            nn.init.xavier_uniform_(layer.weight)
            nn.init.zeros_(layer.bias)

    def set_random_seeds(self):
        torch.manual_seed(self.args.random_seed)
        np.random.seed(self.args.random_seed)
        torch.backends.cudnn.benchmark = True

    def save_model(self, epoch):
        '''Helper function to save model state and weights.'''
        if not os.path.exists(self.weights_path):
            os.makedirs(self.weights_path)
        torch.save(
            {
                'policy_state_dict': self.policy.state_dict(),
                'policy_optimizer': self.policy_optimizer.state_dict(),
                'critic_state_dict': self.critic.state_dict(),
                'critic_optimizer': self.critic_optimizer.state_dict(),
                'rewards_data': self.rewards_data,
                'epoch': epoch
            }, os.path.join(self.weights_path, 'model_%d.h5' % epoch))

    def load_model(self):
        '''Helper function to load model state and weights. '''
        if os.path.isfile(self.args.weights_path):
            print('=> Loading checkpoint', self.args.weights_path)
            self.checkpoint = torch.load(self.args.weights_path)
            self.policy.load_state_dict(self.checkpoint['policy_state_dict'])
            self.policy_optimizer.load_state_dict(
                self.checkpoint['policy_optimizer'])
            self.critic.load_state_dict(self.checkpoint['critic_state_dict'])
            self.critic_optimizer.load_state_dict(
                self.checkpoint['critic_optimizer'])
            self.rewards_data = self.checkpoint['rewards_data']
        else:
            raise Exception('No checkpoint found at %s' %
                            self.args.weights_path)

    def train(self):
        '''Trains the policy and critic with n-step advantage actor-critic updates, one episode per epoch.'''
        for epoch in range(self.args.num_episodes):
            # Generate episode data.
            returns, log_probs, value_function, train_rewards = self.generate_episode(
            )
            self.summary_writer.add_scalar('train/cumulative_rewards',
                                           train_rewards, epoch)
            self.summary_writer.add_scalar('train/trajectory_length',
                                           returns.size()[0], epoch)

            # Compute loss and policy gradient.
            self.policy_optimizer.zero_grad()
            policy_loss = ((returns - value_function.detach()) *
                           -log_probs).mean()
            policy_loss.backward()
            self.policy_optimizer.step()

            self.critic_optimizer.zero_grad()
            critic_loss = F.mse_loss(returns, value_function)
            critic_loss.backward()
            self.critic_optimizer.step()

            # Test the model.
            if epoch % self.args.test_interval == 0:
                self.policy.eval()
                print('\nTesting')
                rewards = [
                    self.generate_episode(test=True)
                    for epoch in range(self.args.test_episodes)
                ]
                rewards_mean, rewards_std = np.mean(rewards), np.std(rewards)
                print(
                    'Test Rewards (Mean): %.3f | Test Rewards (Std): %.3f\n' %
                    (rewards_mean, rewards_std))
                self.rewards_data.append([epoch, rewards_mean, rewards_std])
                self.summary_writer.add_scalar('test/rewards_mean',
                                               rewards_mean, epoch)
                self.summary_writer.add_scalar('test/rewards_std', rewards_std,
                                               epoch)
                self.policy.train()

            # Logging.
            if epoch % self.args.log_interval == 0:
                print(
                    'Epoch: {0:05d}/{1:05d} | Policy Loss: {2:.3f} | Value Loss: {3:.3f}'
                    .format(epoch, self.args.num_episodes, policy_loss,
                            critic_loss))
                self.summary_writer.add_scalar('train/policy_loss',
                                               policy_loss, epoch)
                self.summary_writer.add_scalar('train/critic_loss',
                                               critic_loss, epoch)

            # Save the model.
            if epoch % self.args.save_interval == 0:
                self.save_model(epoch)

        self.save_model(epoch)
        self.summary_writer.close()

    def generate_episode(self,
                         gamma=0.99,
                         test=False,
                         render=False,
                         max_iters=10000):
        '''
        Generates an episode by executing the current policy in the given env.
        Returns (in training mode):
        - n-step discounted returns, indexed by time step
        - log-probabilities of the actions taken
        - critic value estimates for each time step
        - the episode's cumulative (undiscounted) reward
        In test mode only the cumulative reward is returned.
        '''
        iters = 0
        done = False
        state = self.env.reset()

        # Set video save path if render enabled.
        if render:
            save_path = 'videos/%s/epoch-%s' % (self.environment_name,
                                                self.checkpoint['epoch'])
            if not os.path.exists(save_path): os.makedirs(save_path)
            monitor = gym.wrappers.Monitor(self.env, save_path, force=True)

        batches = []
        states = [torch.zeros(84, 84, device=self.device).float()] * 3
        rewards, returns = [], []
        actions, log_probs = [], []

        while not done:
            # Run policy on current state to log probabilities of actions.
            states.append(
                torch.tensor(preprocess(state),
                             device=self.device).float().squeeze(0))
            batches.append(torch.stack(states[-4:]))
            action_probs = self.policy.forward(
                batches[-1].unsqueeze(0)).squeeze(0)

            # Sample action from the log probabilities.
            if test and self.args.det_eval: action = torch.argmax(action_probs)
            else:
                action = torch.argmax(
                    torch.distributions.Multinomial(
                        logits=action_probs).sample())
            actions.append(action)
            log_probs.append(action_probs[action])

            # Run simulation with current action to get new state and reward.
            if render: monitor.render()
            state, reward, done, _ = self.env.step(action.cpu().numpy())
            rewards.append(reward)

            # Break if the episode takes too long.
            iters += 1
            if iters > max_iters: break

        # Save video and close rendering.
        cum_rewards = np.sum(rewards)
        if render:
            monitor.close()
            print('\nCumulative Rewards:', cum_rewards)
            return

        # Return cumulative rewards for test mode.
        if test: return cum_rewards

        # Scale rewards before computing returns.
        rewards = np.array(rewards) / self.args.reward_normalizer

        # Compute value.
        values = []
        minibatches = torch.split(torch.stack(batches), 256)
        for minibatch in minibatches:
            values.append(
                self.critic.forward(minibatch, action=False).squeeze(1))
        values = torch.cat(values)
        discounted_values = values * gamma**self.args.n

        # Compute the cumulative discounted returns.
        n_step_rewards = np.zeros((1, self.args.n))
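        # n_step_rewards is a rolling window over the last n rewards: each
        # backward step discounts the existing entries by gamma and appends the
        # current reward, so its sum equals
        # r_i + gamma*r_{i+1} + ... + gamma^(n-1)*r_{i+n-1};
        # V_end bootstraps the tail with gamma^n * V(s_{i+n}).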
        for i in reversed(range(rewards.shape[0])):
            if i + self.args.n >= rewards.shape[0]:
                V_end = 0
            else:
                V_end = discounted_values[i + self.args.n]
            n_step_rewards[0, :-1] = n_step_rewards[0, 1:] * gamma
            n_step_rewards[0, -1] = rewards[i]

            n_step_return = torch.tensor(
                n_step_rewards.sum(), device=self.device).unsqueeze(0) + V_end
            returns.append(n_step_return)

        # Normalize returns.
        # returns = torch.stack(returns)
        # mean_return, std_return = returns.mean(), returns.std()
        # returns = (returns - mean_return) / (std_return + self.eps)

        return torch.stack(returns[::-1]).detach().squeeze(1), torch.stack(
            log_probs), values.squeeze(), cum_rewards

    def plot(self):
        # Save the plot.
        filename = os.path.join(
            'plots',
            *self.args.weights_path.split('/')[-2:]).replace('.h5', '.png')
        if not os.path.exists(os.path.dirname(filename)):
            os.makedirs(os.path.dirname(filename))

        # Make error plot with mean, std of rewards.
        data = np.asarray(self.rewards_data)
        plt.errorbar(data[:, 0],
                     data[:, 1],
                     data[:, 2],
                     lw=2.5,
                     elinewidth=1.5,
                     ecolor='grey',
                     barsabove=True,
                     capthick=2,
                     capsize=3)
        plt.title('Cumulative Rewards (Mean/Std) Plot for A3C Algorithm')
        plt.xlabel('Number of Episodes')
        plt.ylabel('Cumulative Rewards')
        plt.grid()
        plt.savefig(filename, dpi=300)
        plt.show()
Esempio n. 29
def train(rank, args, shared_model, optimizer=None):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    state = env.reset()
    state = torch.from_numpy(state)
    done = True

    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
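        # Reset the LSTM hidden/cell state at episode boundaries; otherwise
        # carry it over, detached from the previous rollout's graph.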
        if done:
            cx = Variable(torch.zeros(1, 256))
            hx = Variable(torch.zeros(1, 256))
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            value, logit, (hx, cx) = model(
                (Variable(state.unsqueeze(0)), (hx, cx)))
            prob = F.softmax(logit, dim=1)
            log_prob = F.log_softmax(logit, dim=1)
            entropy = -(log_prob * prob).sum(1)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).data
            log_prob = log_prob.gather(1, Variable(action))

            state, reward, done, _ = env.step(action.numpy())
            done = done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)

            if done:
                episode_length = 0
                state = env.reset()

            state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

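        # Bootstrap the return with the critic's value estimate when the
        # rollout was truncated before the episode actually ended.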
        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
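            # delta_t is the one-step TD error; gae accumulates these errors
            # with an exponentially decaying weight gamma * tau (lambda).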
            delta_t = rewards[i] + args.gamma * \
                values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

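            # Policy gradient weighted by the GAE advantage, plus a small
            # entropy bonus (coefficient 0.01) to encourage exploration.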
            policy_loss = policy_loss - \
                log_probs[i] * Variable(gae) - 0.01 * entropies[i]

        optimizer.zero_grad()

        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 40)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
Esempio n. 30
def train(rank, args, shared_model, optimizer=None):

    mse_loss = torch.nn.MSELoss()
    nll_loss = torch.nn.NLLLoss()

    torch.manual_seed(args.seed + rank)

    env = env_wrapper.create_doom(args.record, outdir=args.outdir)
    num_outputs = env.action_space.n
    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    state = env.reset()
    state = torch.from_numpy(state)
    done = True

    episode_length = 0

    while True:
        episode_length += 1
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = Variable(torch.zeros(1, 256))
            hx = Variable(torch.zeros(1, 256))
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)
        values = []
        log_probs = []
        rewards = []
        entropies = []

        inverses = []
        forwards = []
        actions = []
        vec_st1s = []

        for step in range(args.num_steps):
            value, logit, (hx, cx) = model(
                (Variable(state.unsqueeze(0)), (hx, cx)), icm=False)
            s_t = state
            prob = F.softmax(logit, dim=1)
            log_prob = F.log_softmax(logit, dim=1)
            entropy = -(log_prob * prob).sum(1)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).data
            log_prob = log_prob.gather(1, Variable(action))

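            # One-hot encode the chosen action for the ICM inverse/forward models.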
            oh_action = torch.Tensor(1, num_outputs)
            oh_action.zero_()
            oh_action.scatter_(1, action, 1)
            oh_action = Variable(oh_action)
            a_t = oh_action
            actions.append(oh_action)

            state, reward, done, _ = env.step(action.numpy()[0][0])
            state = torch.from_numpy(state)

            done = done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)
            s_t1 = state
            vec_st1, inverse, forward = model(
                (Variable(s_t.unsqueeze(0)), Variable(s_t1.unsqueeze(0)), a_t),
                icm=True)

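            # Curiosity bonus: the intrinsic reward is the squared error between
            # the forward model's predicted next-state features and the encoded
            # next state, scaled by eta.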
            reward_intrinsic = args.eta * (
                (vec_st1 - forward).pow(2)).sum(1) / 2.
            #reward_intrinsic = args.eta * ((vec_st1 - forward).pow(2)).sum(1).sqrt() / 2.
            reward_intrinsic = reward_intrinsic.data.numpy()[0][0]
            reward += reward_intrinsic

            if done:
                episode_length = 0
                state = env.reset()
                state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)
            vec_st1s.append(vec_st1)
            inverses.append(inverse)
            forwards.append(forward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)),
                                icm=False)
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        inverse_loss = 0
        forward_loss = 0

        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * Variable(gae) - 0.01 * entropies[i]

            cross_entropy = -(actions[i] *
                              torch.log(inverses[i] + 1e-15)).sum(1)
            inverse_loss = inverse_loss + cross_entropy
            forward_err = forwards[i] - vec_st1s[i]
            forward_loss = forward_loss + 0.5 * (forward_err.pow(2)).sum(1)

        optimizer.zero_grad()

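        # Backpropagate the ICM losses and the A3C loss separately; retain_graph
        # keeps the shared feature graph alive for the second backward pass.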
        ((1 - args.beta) * inverse_loss +
         args.beta * forward_loss).backward(retain_graph=True)
        (args.lmbda * (policy_loss + 0.5 * value_loss)).backward()

        #(((1-args.beta) * inverse_loss + args.beta * forward_loss) + args.lmbda * (policy_loss + 0.5 * value_loss)).backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 40)

        ensure_shared_grads(model, shared_model)
        optimizer.step()