Example 1
def play_dqn(params):
    trainer = DoomTrainer(params)
    trainer.start_game()

    model = DQN(trainer.num_actions())
    softmax_body = SoftmaxBody(T=1)
    ai = AI(brain=model, body=softmax_body)

    n_steps = NStepProgress(trainer, ai, n_step=10)
    memory = ReplayMemory(n_steps=n_steps, capacity=10000)
    train_dqn(model, memory, n_steps)
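
The SoftmaxBody and AI wrappers are defined elsewhere in the project. As a rough sketch of the usual pattern (class bodies assumed here, not taken from this repository), the body turns the network's Q-values into a sampled action through a temperature-scaled softmax, and AI simply chains brain and body:

import torch.nn as nn
import torch.nn.functional as F

class SoftmaxBody(nn.Module):
    def __init__(self, T):
        super(SoftmaxBody, self).__init__()
        self.T = T  # temperature scale: larger T sharpens the distribution, smaller T explores more

    def forward(self, outputs):
        probs = F.softmax(outputs * self.T, dim=1)   # Q-values -> action probabilities
        return probs.multinomial(num_samples=1)      # sample one action index per state

class AI:
    def __init__(self, brain, body):
        self.brain = brain  # the DQN: state -> Q-values
        self.body = body    # the SoftmaxBody: Q-values -> sampled action

    def __call__(self, inputs):
        return self.body(self.brain(inputs)).data.numpy()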
Example 2
def play_a2c(params):
    trainer = DoomTrainer(params)
    trainer.start_game()
    model = A2C(1, len(trainer.actions))
    optimizer = torch.optim.Adam(model.parameters(), lr=params.lr)

    counter = 0
    while True:
        if counter % 10 == 0:
            print("Iteration: ", counter)

        train_a2c(params, trainer, model, optimizer)
        test_a2c(params, trainer, model)
        counter += 1
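
train_a2c and test_a2c are project functions not shown here. For orientation, here is a minimal sketch of the standard advantage actor-critic objective that such a training step computes over one rollout (name and signature are assumptions, not the project's actual code):

def a2c_loss(values, log_probs, entropies, rewards, R, gamma=0.99, entropy_coef=0.01):
    # values, log_probs, entropies: lists of 1x1 tensors collected over the rollout
    # rewards: list of floats; R: bootstrap value of the last state (0 if the episode ended)
    policy_loss, value_loss = 0, 0
    for i in reversed(range(len(rewards))):
        R = gamma * R + rewards[i]                                     # discounted return
        advantage = R - values[i]
        value_loss = value_loss + 0.5 * advantage.pow(2)               # critic: regress value toward return
        policy_loss = policy_loss - log_probs[i] * advantage.detach()  # actor: policy gradient term
        policy_loss = policy_loss - entropy_coef * entropies[i]        # entropy bonus for exploration
    return policy_loss + 0.5 * value_loss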
Example 3
def play_a3c(params):
    trainer = DoomTrainer(params)
    os.environ['OMP_NUM_THREADS'] = '1'
    shared_model = A3C(1, trainer.num_actions()).cuda()
    shared_model.share_memory()

    optimizer = optimizers.SharedAdam(shared_model.parameters(), lr=params.lr)
    optimizer.share_memory()

    processes = []

    process = mp.Process(target=test_a3c,
                         args=(params.num_processes, params, shared_model))
    process.start()
    processes.append(process)  # track the test process so it is joined below as well

    for rank in range(0, params.num_processes):
        process = mp.Process(target=train_a3c,
                             args=(rank, params, shared_model, optimizer))
        process.start()
        processes.append(process)

    for p in processes:
        p.join()
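
optimizers.SharedAdam is not shown in these examples. The usual idea, sketched below under the assumption that it follows the common A3C pattern (the project's implementation may differ), is an Adam whose moment estimates live in shared memory, so every worker process updates the same optimizer state:

import math
import torch

class SharedAdam(torch.optim.Adam):
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8):
        super(SharedAdam, self).__init__(params, lr=lr, betas=betas, eps=eps)
        # allocate the optimizer state up front so it can be moved into shared memory
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = torch.zeros(1)
                state['exp_avg'] = torch.zeros_like(p.data)
                state['exp_avg_sq'] = torch.zeros_like(p.data)

    def share_memory(self):
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'].share_memory_()
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()

    def step(self, closure=None):
        # plain Adam update written against the pre-allocated, shared state tensors
        for group in self.param_groups:
            beta1, beta2 = group['betas']
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                state = self.state[p]
                state['step'] += 1
                step = float(state['step'][0])
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                denom = exp_avg_sq.sqrt().add_(group['eps'])
                step_size = group['lr'] * math.sqrt(1 - beta2 ** step) / (1 - beta1 ** step)
                p.data.addcdiv_(exp_avg, denom, value=-step_size)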
Example 4
def play_a3c(params):
    torch.manual_seed(params.seed)
    if params.gpu_ids == -1:
        params.gpu_ids = [-1]
    else:
        torch.cuda.manual_seed(params.seed)
        mp.set_start_method('spawn')

    trainer = DoomTrainer(params)

    # Initialize the shared model on the CPU: a CUDA-resident model cannot be
    # pickled/shared across the worker processes here, so the shared copy stays in
    # CPU (shared) memory and each worker moves its own copy to a GPU if one is assigned.
    model_name = "save/" + "a3c"  # checkpoint path prefix used when saving/loading the model
    shared_model = A3C(1, len(trainer.actions)).cpu()
    if params.load:
        saved_state = torch.load('{}.dat'.format(model_name),
                                 map_location=lambda storage, loc: storage)
        shared_model.load_state_dict(saved_state)
    shared_model.share_memory()

    optimizer = optimizers.SharedAdam(shared_model.parameters(), lr=params.lr)
    optimizer.share_memory()

    processes = []

    process = mp.Process(target=test_a3c,
                         args=(params.num_processes, params, shared_model))
    process.start()
    processes.append(process)

    for rank in range(0, params.num_processes):
        process = mp.Process(target=train_a3c,
                             args=(rank, params, shared_model, optimizer))
        process.start()
        processes.append(process)

    for p in processes:
        p.join()
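
This variant can restore weights from '{model_name}.dat' when params.load is set, but none of the code shown here writes that file. A plausible counterpart (an assumption, not code from the project) is for the test/monitor process to checkpoint the shared weights periodically:

import torch

def save_checkpoint(shared_model, model_name="save/a3c"):
    # the CPU-resident shared model can be serialized directly;
    # the params.load branch above restores it with torch.load + load_state_dict
    torch.save(shared_model.state_dict(), '{}.dat'.format(model_name))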
Example 5
def play_human(params):
    trainer = DoomTrainer(params)
    trainer.start_game()
    trainer.play_human()
Example 6
def train(rank, args, shared_model, optimizer):

    # separate gpu ids
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]

    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)

    trainer = DoomTrainer(args)
    trainer.set_seed(args.seed + rank)
    trainer.start_game()

    print("hello")
    model = A3C(1, trainer.num_actions())
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            model = model.cuda()

    trainer.new_episode()
    state = trainer.get_screen()

    done = True
    episode_length = 0

    while True:
        episode_length += 1
        model.load_state_dict(shared_model.state_dict())

        if done:
            cx = Variable(torch.zeros(1, 256))
            hx = Variable(torch.zeros(1, 256))
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)
        if gpu_id >= 0:
            # keep the LSTM state on this worker's GPU only when one is assigned
            cx, hx = cx.cuda(gpu_id), hx.cuda(gpu_id)

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            inputs = Variable(state.unsqueeze(0))
            if gpu_id >= 0:
                inputs = inputs.cuda(gpu_id)
            value, action_values, (hx, cx) = model((inputs, (hx, cx)))
            prob = F.softmax(action_values)
            log_prob = F.log_softmax(action_values)

            entropy = -(log_prob * prob).sum(1)
            entropies.append(entropy)

            action = prob.multinomial().data
            log_prob = log_prob.gather(1, Variable(action))

            values.append(value)
            log_probs.append(log_prob)

            reward, is_done = trainer.make_action(action[0][0])
            done = is_done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)

            if done:
                episode_length = 0
                trainer.new_episode()

            state = trainer.get_screen()
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if gpu_id >= 0:
            R = R.cuda(gpu_id)
        if not done:
            # bootstrap from the value of the last state when the rollout was truncated
            inputs = Variable(state.unsqueeze(0))
            if gpu_id >= 0:
                inputs = inputs.cuda(gpu_id)
            value, _, _ = model((inputs, (hx, cx)))
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0

        R = Variable(R)
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            gae = gae.cuda(gpu_id)

        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]

            value_loss = value_loss + 0.5 * advantage.pow(2)
            TD = rewards[i] + args.gamma * values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + TD
            policy_loss = policy_loss - log_probs[i] * Variable(
                gae) - 0.01 * entropies[i]

        optimizer.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()

        torch.nn.utils.clip_grad_norm(model.parameters(), 40)
        ensure_shared_grads(model, shared_model)

        optimizer.step()
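
ensure_shared_grads is not shown in these examples. The helper conventionally used with this A3C setup (sketched here on that assumption) copies the worker's gradients onto the CPU-resident shared model before optimizer.step(); when the worker runs on a GPU, the gradients first have to be brought back to the CPU:

def ensure_shared_grads(model, shared_model, gpu=False):
    # point the shared model's gradients at the worker's gradients
    for param, shared_param in zip(model.parameters(), shared_model.parameters()):
        if shared_param.grad is not None and not gpu:
            return  # already wired up by an earlier call on the CPU path
        shared_param._grad = param.grad.cpu() if gpu else param.grad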
Example 7
def test(rank, params, shared_model):
    torch.manual_seed(params.seed + rank)

    trainer = DoomTrainer(params)
    trainer.set_seed(params.seed + rank)
    trainer.start_game()

    model = A3C(1, trainer.num_actions()).cuda()
    model.eval()

    trainer.new_episode()
    state = trainer.get_screen()

    reward_sum = 0
    done = True
    start_time = time.time()

    episode_length = 0

    while True:
        episode_length += 1
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True).cuda()
            hx = Variable(torch.zeros(1, 256), volatile=True).cuda()
        else:
            cx = Variable(cx.data, volatile=True).cuda()
            hx = Variable(hx.data, volatile=True).cuda()
        value, action_value, (hx, cx) = model(
            (Variable(state.unsqueeze(0), volatile=True).cuda(), (hx, cx)))
        prob = F.softmax(action_value)
        action = prob.max(1)[1].cpu().data.numpy()

        reward, done = trainer.make_action(action[0])
        reward_sum += reward

        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            log_reward(reward_sum)
            reward_sum = 0
            episode_length = 0
            trainer.new_episode()
            time.sleep(15)
        state = trainer.get_screen()
Example 8
def play_dqn(parameters):
    trainer = DoomTrainer(parameters)
    trainer.start_game()
    train_dqn(parameters, trainer)
Example 9
def test(params, trainer, model):
    # reuse the trainer and running game passed in from the training loop
    # instead of creating and starting a second game instance

    trainer.set_seed(params.seed)
    torch.manual_seed(params.seed)

    model.eval()

    trainer.new_episode()
    state = trainer.get_screen()

    reward_sum = 0
    done = True
    start_time = time.time()

    episode_length = 0
    actions = deque(maxlen=2100)

    while True:
        episode_length += 1
        if done:
            cx = Variable(torch.zeros(1, 512), volatile=True).cuda()
            hx = Variable(torch.zeros(1, 512), volatile=True).cuda()
        else:
            cx = Variable(cx.data, volatile=True).cuda()
            hx = Variable(hx.data, volatile=True).cuda()

        value, action_value, (hx, cx) = model((Variable(state.unsqueeze(0), volatile=True).cuda(), (hx, cx)))
        prob = F.softmax(action_value)
        action = prob.max(1)[1].cpu().data.numpy()

        reward, is_done = trainer.make_action(action[0])
        done = is_done or episode_length >= params.max_episode_length
        reward_sum += reward

        actions.append(action[0])
        if actions.count(actions[0]) == actions.maxlen:
            done = True

        if done:
            print("Time {}, episode reward {}, episode length {}".format(time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)), reward_sum, episode_length))
            actions.clear()
            return

        state = trainer.get_screen()