def play_dqn(params):
    trainer = DoomTrainer(params)
    trainer.start_game()

    model = DQN(trainer.num_actions())
    softmax_body = SoftmaxBody(T=1)
    ai = AI(brain=model, body=softmax_body)

    n_steps = NStepProgress(trainer, ai, n_step=10)
    memory = ReplayMemory(n_steps=n_steps, capacity=10000)
    train_dqn(model, memory, n_steps)
def play_a2c(params):
    trainer = DoomTrainer(params)
    trainer.start_game()

    model = A2C(1, len(trainer.actions))
    optimizer = torch.optim.Adam(model.parameters(), lr=params.lr)

    counter = 0
    while True:
        if counter % 10 == 0:
            print("Iteration: ", counter)
        train_a2c(params, trainer, model, optimizer)
        test_a2c(params, trainer, model)
        counter += 1
def play_a3c(params):
    trainer = DoomTrainer(params)

    os.environ['OMP_NUM_THREADS'] = '1'
    shared_model = A3C(1, trainer.num_actions()).cuda()
    shared_model.share_memory()

    optimizer = optimizers.SharedAdam(shared_model.parameters(), lr=params.lr)
    optimizer.share_memory()

    processes = []

    # The test process gets rank params.num_processes so its seed differs from every worker.
    process = mp.Process(target=test_a3c, args=(params.num_processes, params, shared_model))
    process.start()
    processes.append(process)

    for rank in range(0, params.num_processes):
        process = mp.Process(target=train_a3c, args=(rank, params, shared_model, optimizer))
        process.start()
        processes.append(process)

    for p in processes:
        p.join()
def play_a3c(params):
    torch.manual_seed(params.seed)
    if params.gpu_ids == -1:
        params.gpu_ids = [-1]
    else:
        torch.cuda.manual_seed(params.seed)
    mp.set_start_method('spawn')  # required so CUDA can be used safely in child processes

    trainer = DoomTrainer(params)

    # Initialize the shared model on the CPU: CUDA storage cannot be pickled when the model
    # is shared across processes started with 'spawn', so the shared weights stay in
    # (shared) CPU memory and each worker moves its own copy to the GPU.
    model_name = "save/" + "a3c"  # path prefix used when saving/loading the model
    shared_model = A3C(1, len(trainer.actions)).cpu()

    if params.load:
        saved_state = torch.load('{}.dat'.format(model_name),
                                 map_location=lambda storage, loc: storage)
        shared_model.load_state_dict(saved_state)
    shared_model.share_memory()

    optimizer = optimizers.SharedAdam(shared_model.parameters(), lr=params.lr)
    optimizer.share_memory()

    processes = []

    process = mp.Process(target=test_a3c, args=(params.num_processes, params, shared_model))
    process.start()
    processes.append(process)

    for rank in range(0, params.num_processes):
        process = mp.Process(target=train_a3c, args=(rank, params, shared_model, optimizer))
        process.start()
        processes.append(process)

    for p in processes:
        p.join()
def play_human(params):
    trainer = DoomTrainer(params)
    trainer.start_game()
    trainer.play_human()
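# A minimal entry-point sketch showing how these play_* functions might be dispatched.
# Both get_params() and the params.model field are assumptions for illustration; the
# repo's actual argument parsing and option names may differ.
if __name__ == '__main__':
    params = get_params()  # hypothetical helper that parses CLI args / config into params
    if params.model == 'dqn':
        play_dqn(params)
    elif params.model == 'a2c':
        play_a2c(params)
    elif params.model == 'a3c':
        play_a3c(params)
    elif params.model == 'human':
        play_human(params)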
def train(rank, args, shared_model, optimizer):
    # Assign this worker a GPU and seed it independently of the other workers.
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)

    trainer = DoomTrainer(args)
    trainer.set_seed(args.seed + rank)
    trainer.start_game()

    model = A3C(1, trainer.num_actions())
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            model = model.cuda()

    trainer.new_episode()
    state = trainer.get_screen()

    done = True
    episode_length = 0

    while True:
        episode_length += 1
        # Sync the worker with the shared model before collecting a new rollout.
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = Variable(torch.zeros(1, 256)).cuda()
            hx = Variable(torch.zeros(1, 256)).cuda()
        else:
            cx = Variable(cx.data).cuda()
            hx = Variable(hx.data).cuda()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            value, action_values, (hx, cx) = model(
                (Variable(state.unsqueeze(0)).cuda(), (hx, cx)))
            prob = F.softmax(action_values)
            log_prob = F.log_softmax(action_values)
            entropy = -(log_prob * prob).sum(1)
            entropies.append(entropy)

            action = prob.multinomial().data
            log_prob = log_prob.gather(1, Variable(action))
            values.append(value)
            log_probs.append(log_prob)

            reward, is_done = trainer.make_action(action[0][0])
            done = is_done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)  # clip rewards to [-1, 1]

            if done:
                episode_length = 0
                trainer.new_episode()

            state = trainer.get_screen()
            rewards.append(reward)

            if done:
                break

        # Bootstrap the return from the critic if the rollout did not end the episode.
        # R and gae are created on the GPU so they match the device of the model outputs.
        R = torch.zeros(1, 1).cuda()
        if not done:
            value, _, _ = model(
                (Variable(state.unsqueeze(0)).cuda(), (hx, cx)))
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1).cuda()

        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            TD = rewards[i] + args.gamma * values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + TD
            policy_loss = policy_loss - log_probs[i] * Variable(gae) - 0.01 * entropies[i]

        optimizer.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), 40)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
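# ensure_shared_grads is called in train() above but is not defined in this section.
# A minimal sketch, assuming the standard A3C pattern of copying each worker's gradients
# into the shared model only when the shared gradients are still unset; skip this if the
# helper already exists elsewhere in the repo.
def ensure_shared_grads(model, shared_model):
    for param, shared_param in zip(model.parameters(), shared_model.parameters()):
        if shared_param.grad is not None:
            return
        shared_param._grad = param.grad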
def test(rank, params, shared_model):
    torch.manual_seed(params.seed + rank)

    trainer = DoomTrainer(params)
    trainer.set_seed(params.seed + rank)
    trainer.start_game()

    model = A3C(1, trainer.num_actions()).cuda()
    model.eval()

    trainer.new_episode()
    state = trainer.get_screen()
    reward_sum = 0
    done = True
    start_time = time.time()
    episode_length = 0

    while True:
        episode_length += 1
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True).cuda()
            hx = Variable(torch.zeros(1, 256), volatile=True).cuda()
        else:
            cx = Variable(cx.data, volatile=True).cuda()
            hx = Variable(hx.data, volatile=True).cuda()

        value, action_value, (hx, cx) = model(
            (Variable(state.unsqueeze(0), volatile=True).cuda(), (hx, cx)))
        prob = F.softmax(action_value)
        action = prob.max(1)[1].cpu().data.numpy()

        reward, done = trainer.make_action(action[0])
        reward_sum += reward

        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            log_reward(reward_sum)

            reward_sum = 0
            episode_length = 0
            trainer.new_episode()
            time.sleep(15)

        state = trainer.get_screen()
def play_dqn(parameters):
    trainer = DoomTrainer(parameters)
    trainer.start_game()
    train_dqn(parameters, trainer)
def test(params, trainer, model):
    trainer = DoomTrainer(params)
    trainer.start_game()
    trainer.set_seed(params.seed)
    torch.manual_seed(params.seed)

    model.eval()

    trainer.new_episode()
    state = trainer.get_screen()
    reward_sum = 0
    done = True
    start_time = time.time()
    episode_length = 0
    actions = deque(maxlen=2100)

    while True:
        episode_length += 1
        if done:
            cx = Variable(torch.zeros(1, 512), volatile=True).cuda()
            hx = Variable(torch.zeros(1, 512), volatile=True).cuda()
        else:
            cx = Variable(cx.data, volatile=True).cuda()
            hx = Variable(hx.data, volatile=True).cuda()

        value, action_value, (hx, cx) = model(
            (Variable(state.unsqueeze(0), volatile=True).cuda(), (hx, cx)))
        prob = F.softmax(action_value)
        action = prob.max(1)[1].cpu().data.numpy()

        reward, is_done = trainer.make_action(action[0])
        done = is_done or episode_length >= params.max_episode_length
        reward_sum += reward

        # If the agent repeats the same action for the entire window, treat the episode
        # as stuck and end it.
        actions.append(action[0])
        if actions.count(actions[0]) == actions.maxlen:
            done = True

        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            actions.clear()
            return

        state = trainer.get_screen()