def main():
    """Train an actor-critic agent (A2C / ACKTR / A2T / ResNet policy) on VizDoom.

    Relies on module-level state: ``args`` (parsed CLI options), ``num_updates``
    (total update count; not defined in this function — presumably module-level,
    TODO confirm), and publishes the vectorized environment via the global
    ``envs``.
    """
    print("###############################################################")
    print("#################### VISDOOM LEARNER START ####################")
    print("###############################################################")
    # Keep BLAS/OpenMP single-threaded per worker process.
    os.environ['OMP_NUM_THREADS'] = '1'
    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None
    global envs
    envs = VecEnv(
        [make_env(i, args.config_path) for i in range(args.num_processes)],
        logging=True,
        log_dir=args.log_dir)
    obs_shape = envs.observation_space_shape
    # Stack the last `num_stack` frames along the channel dimension.
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
    if args.algo == 'a2c' or args.algo == 'acktr':
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space_shape)
    elif args.algo == 'a2t':
        # A2T transfers from previously trained source policies (*.pt files).
        source_models = []
        files = glob.glob(os.path.join(args.source_models_path, '*.pt'))
        for file in files:
            print(file, 'loading model...')
            source_models.append(torch.load(file))
        actor_critic = A2TPolicy(obs_shape[0], envs.action_space_shape,
                                 source_models)
    elif args.algo == 'resnet':
        # args.num_stack = 3
        actor_critic = ResnetPolicy(obs_shape[0], envs.action_space_shape)
    action_shape = 1
    if args.cuda:
        actor_critic.cuda()
    if args.algo == 'a2c' or args.algo == 'resnet':
        optimizer = optim.RMSprop(actor_critic.parameters(), args.lr,
                                  eps=args.eps, alpha=args.alpha)
    elif args.algo == 'a2t':
        # Only optimize the parameters that are not frozen.
        a2t_params = [p for p in actor_critic.parameters() if p.requires_grad]
        optimizer = optim.RMSprop(a2t_params, args.lr,
                                  eps=args.eps, alpha=args.alpha)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)
    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space_shape)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        # Shift the frame stack left and append the newest observation.
        shape_dim0 = envs.observation_space_shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)
    rollouts.observations[0].copy_(current_obs)
    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])
    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()
    start = time.time()
    # NOTE(review): `num_updates` is not defined in this function — verify it
    # exists at module scope before running this entry point standalone.
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions (volatile=True: legacy pre-0.4 no-grad inference).
            value, action = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()
            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            # print ('Actions:', cpu_actions, 'Rewards:', reward)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward
            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
            if args.cuda:
                masks = masks.cuda()
            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks
            update_current_obs(obs)
            rollouts.insert(step, current_obs, action.data, value.data,
                            reward, masks)
        # Bootstrap the return from the value of the final observation.
        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True))[0].data
        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.observations[:-1].view(
                -1, *obs_shape))
        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)
        if args.algo in ['a2c', 'acktr', 'a2t', 'resnet']:
            values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.actions.view(-1, action_shape)))
            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps,
                                                     args.num_processes, 1)
            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()
            # Detach advantages so the policy gradient does not flow into the
            # value head.
            action_loss = -(Variable(advantages.data) *
                            action_log_probs).mean()
            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()
                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()
                sample_values = values + value_noise
                vf_fisher_loss = -(values -
                                   Variable(sample_values.data)).pow(2).mean()
                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False
            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()
            if args.algo == 'a2c' or args.algo == 'resnet':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)
            elif args.algo == 'a2t':
                nn.utils.clip_grad_norm(a2t_params, args.max_grad_norm)
            optimizer.step()
        rollouts.observations[0].copy_(rollouts.observations[-1])
        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass
            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))
        if j % args.log_interval == 0:
            envs.log()
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        -dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, 'VizDoom', args.algo)
            except IOError:
                pass
    envs.close()
    time.sleep(5)
class KGA2CTrainer(object):
    '''
    KGA2C main class.

    Wraps environment construction, the KGA2C model, target generation, and
    the A2C-style training/update loops. All tensors are placed on CUDA.
    '''

    def __init__(self, params):
        # params: dict of hyperparameters and file paths (rom_file_path,
        # spm_file, tsv_file, output_dir, batch_size, lr, gat, ...).
        print("----- Initiating ----- ")
        print("----- step 1 configure logger")
        configure_logger(params['output_dir'])
        log('Parameters {}'.format(params))
        self.params = params
        print("----- step 2 load pre-collected things")
        self.binding = load_bindings(params['rom_file_path'])
        self.max_word_length = self.binding['max_word_length']
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(params['spm_file'])
        print("----- step 3 build KGA2CEnv")
        kg_env = KGA2CEnv(params['rom_file_path'],
                          params['seed'],
                          self.sp,
                          params['tsv_file'],
                          step_limit=params['reset_steps'],
                          stuck_steps=params['stuck_steps'],
                          gat=params['gat'])
        self.vec_env = VecEnv(params['batch_size'], kg_env,
                              params['openie_path'])
        print("----- step 4 build FrotzEnv and templace generator")
        env = FrotzEnv(params['rom_file_path'])
        self.vocab_act, self.vocab_act_rev = load_vocab(env)
        self.template_generator = TemplateActionGenerator(self.binding)
        print("----- step 5 build kga2c model")
        self.model = KGA2C(params,
                           self.template_generator.templates,
                           self.max_word_length,
                           self.vocab_act,
                           self.vocab_act_rev,
                           len(self.sp),
                           gat=self.params['gat']).cuda()
        if params['preload_weights']:
            # NOTE(review): this replaces the freshly-built model entirely
            # with the checkpointed one.
            print("load pretrained")
            self.model = torch.load(self.params['preload_weights'])['model']
        else:
            print("train from scratch")
        print("----- step 6 set training parameters")
        self.batch_size = params['batch_size']
        self.optimizer = optim.Adam(self.model.parameters(), lr=params['lr'])
        self.loss_fn1 = nn.BCELoss()
        self.loss_fn2 = nn.BCEWithLogitsLoss()  # built but unused below
        self.loss_fn3 = nn.MSELoss()
        print("----- Init finished! ----- ")

    def generate_targets(self, admissible, objs):
        '''
        Generates ground-truth targets for admissible actions.

        :param admissible: List-of-lists of admissible actions. Batch_size x
            Admissible
        :param objs: List-of-lists of interactive objects. Batch_size x Objs
        :returns: template targets and object target tensors
        '''
        tmpl_target = []
        obj_targets = []
        for adm in admissible:
            obj_t = set()
            # One-hot over templates: 1 for every template used by an
            # admissible action in this batch element.
            cur_t = [0] * len(self.template_generator.templates)
            for a in adm:
                cur_t[a.template_id] = 1
                obj_t.update(a.obj_ids)
            tmpl_target.append(cur_t)
            obj_targets.append(list(obj_t))
        tmpl_target_tt = torch.FloatTensor(tmpl_target).cuda()
        # Note: Adjusted to use the objects in the admissible actions only
        object_mask_target = []
        for objl in obj_targets:  # in objs
            # Same multi-hot vocab mask duplicated for both object slots.
            cur_objt = [0] * len(self.vocab_act)
            for o in objl:
                cur_objt[o] = 1
            object_mask_target.append([[cur_objt], [cur_objt]])
        obj_target_tt = torch.FloatTensor(object_mask_target).squeeze().cuda()
        return tmpl_target_tt, obj_target_tt

    def generate_graph_mask(self, graph_infos):
        # Build a per-batch-element boolean mask over the action vocabulary,
        # controlled by params['masking'] ('kg' | 'interactive' | 'none').
        assert len(graph_infos) == self.batch_size
        mask_all = []
        for graph_info in graph_infos:
            mask = [0] * len(self.vocab_act.keys())
            # Case 1 (default): KG as mask
            if self.params['masking'] == 'kg':
                graph_state = graph_info.graph_state  # Full KG as mask --> same as KG-A2C
                # graph_state = graph_info.graph_state_5_mask # sub_KG_5 as mask, disabled
                ents = set()
                # Obtain entities ---> maybe I can perform graph pooling before this step
                for u, v in graph_state.edges:
                    ents.add(u)
                    ents.add(v)
                # Build mask: only use those related to entities; entity words
                # are truncated to max_word_length before vocab lookup.
                for ent in ents:
                    for ent_word in ent.split():
                        if ent_word[:self.
                                    max_word_length] in self.vocab_act_rev:
                            idx = self.vocab_act_rev[
                                ent_word[:self.max_word_length]]
                            mask[idx] = 1
            # Case 2: interactive objects ground truth as the mask.
            elif self.params['masking'] == 'interactive':
                for o in graph_info.objs:
                    o = o[:self.max_word_length]
                    if o in self.vocab_act_rev.keys() and o != '':
                        mask[self.vocab_act_rev[o]] = 1
            # Case 3: no mask.
            elif self.params['masking'] == 'none':
                mask = [1] * len(self.vocab_act.keys())
            else:
                assert False, 'Unrecognized masking {}'.format(
                    self.params['masking'])
            mask_all.append(mask)
        return torch.BoolTensor(mask_all).cuda().detach()

    def discount_reward(self, transitions, last_values):
        # Standard discounted-return / advantage computation walking the
        # transition list backwards; done_masks zero the bootstrap at episode
        # boundaries.
        returns, advantages = [], []
        R = last_values.data
        for t in reversed(range(len(transitions))):
            _, _, values, rewards, done_masks, _, _, _, _, _, _ = transitions[
                t]
            R = rewards + self.params['gamma'] * R * done_masks
            adv = R - values
            returns.append(R)
            advantages.append(adv)
        # Lists were built back-to-front; reverse to chronological order.
        return returns[::-1], advantages[::-1]

    def train(self, max_steps):
        # Main interaction loop: roll the policy, collect transitions, and
        # call update() every params['bptt'] steps.
        print("=== === === start training!!! === === ===")
        start = time.time()
        transitions = []
        obs, infos, graph_infos = self.vec_env.reset()
        for step in range(1, max_steps + 1):
            # Step 1: build model inputs
            tb.logkv('Step', step)
            obs_reps = np.array([g.ob_rep for g in graph_infos])
            scores = [info['score'] for info in infos]
            graph_mask_tt = self.generate_graph_mask(graph_infos)
            graph_state_reps = [g.graph_state_rep for g in graph_infos]
            graph_rep_1 = [
                g.graph_state_rep_1_connectivity for g in graph_infos
            ]
            graph_rep_2 = [g.graph_state_rep_2_roomitem for g in graph_infos]
            graph_rep_3 = [g.graph_state_rep_3_youritem for g in graph_infos]
            graph_rep_4 = [g.graph_state_rep_4_otherroom for g in graph_infos]
            # Step 2: predict probs, actual items
            tmpl_pred_tt, obj_pred_tt, dec_obj_tt, dec_tmpl_tt, value, dec_steps = self.model(
                obs_reps, scores, graph_state_reps, graph_rep_1, graph_rep_2,
                graph_rep_3, graph_rep_4, graph_mask_tt)
            tb.logkv_mean('Value', value.mean().item())
            # Step 3: Log the predictions and ground truth values
            topk_tmpl_probs, topk_tmpl_idxs = F.softmax(
                tmpl_pred_tt[0]).topk(5)
            topk_tmpls = [
                self.template_generator.templates[t]
                for t in topk_tmpl_idxs.tolist()
            ]
            tmpl_pred_str = ', '.join([
                '{} {:.3f}'.format(tmpl, prob)
                for tmpl, prob in zip(topk_tmpls, topk_tmpl_probs.tolist())
            ])
            # Step 4: Generate the ground truth and object mask
            admissible = [g.admissible_actions for g in graph_infos]
            objs = [g.objs for g in graph_infos]
            tmpl_gt_tt, obj_mask_gt_tt = self.generate_targets(
                admissible, objs)
            # Step 5: Log template/object predictions/ground_truth
            gt_tmpls = [
                self.template_generator.templates[i] for i in tmpl_gt_tt[0].
                nonzero().squeeze().cpu().numpy().flatten().tolist()
            ]
            gt_objs = [
                self.vocab_act[i] for i in obj_mask_gt_tt[
                    0, 0].nonzero().squeeze().cpu().numpy().flatten().tolist()
            ]
            topk_o1_probs, topk_o1_idxs = F.softmax(obj_pred_tt[0, 0]).topk(5)
            topk_o1 = [self.vocab_act[o] for o in topk_o1_idxs.tolist()]
            o1_pred_str = ', '.join([
                '{} {:.3f}'.format(o, prob)
                for o, prob in zip(topk_o1, topk_o1_probs.tolist())
            ])
            chosen_actions = self.decode_actions(dec_tmpl_tt, dec_obj_tt)
            # Step 6: Next step
            obs, rewards, dones, infos, graph_infos = self.vec_env.step(
                chosen_actions)
            # Step 7: logging
            tb.logkv_mean(
                'TotalStepsPerEpisode',
                sum([i['steps'] for i in infos]) / float(len(graph_infos)))
            tb.logkv_mean('Valid', infos[0]['valid'])
            if dones[0]:
                log('Step {} EpisodeScore {}'.format(step, infos[0]['score']))
            for done, info in zip(dones, infos):
                if done:
                    tb.logkv_mean('EpisodeScore', info['score'])
            # Step 8: append into transitions
            rew_tt = torch.FloatTensor(rewards).cuda().unsqueeze(1)
            # done_mask is 0 where an episode finished, 1 otherwise.
            done_mask_tt = (~torch.tensor(dones)).float().cuda().unsqueeze(1)
            self.model.reset_hidden(done_mask_tt)
            transitions.append(
                (tmpl_pred_tt, obj_pred_tt, value, rew_tt, done_mask_tt,
                 tmpl_gt_tt, dec_tmpl_tt, dec_obj_tt, obj_mask_gt_tt,
                 graph_mask_tt, dec_steps))
            # Step 9: update model per 8 steps
            if len(transitions) >= self.params['bptt']:
                tb.logkv('StepsPerSecond', float(step) / (time.time() - start))
                # Preserve recurrent state across the extra bootstrap forward
                # pass below.
                self.model.clone_hidden()
                obs_reps = np.array([g.ob_rep for g in graph_infos])
                scores = [info['score'] for info in infos]
                graph_mask_tt = self.generate_graph_mask(graph_infos)
                graph_state_reps = [g.graph_state_rep for g in graph_infos]
                graph_rep_1 = [
                    g.graph_state_rep_1_connectivity for g in graph_infos
                ]
                graph_rep_2 = [
                    g.graph_state_rep_2_roomitem for g in graph_infos
                ]
                graph_rep_3 = [
                    g.graph_state_rep_3_youritem for g in graph_infos
                ]
                graph_rep_4 = [
                    g.graph_state_rep_4_otherroom for g in graph_infos
                ]
                _, _, _, _, next_value, _ = self.model(
                    obs_reps, scores, graph_state_reps, graph_rep_1,
                    graph_rep_2, graph_rep_3, graph_rep_4, graph_mask_tt)
                returns, advantages = self.discount_reward(
                    transitions, next_value)
                tb.logkv_mean('Advantage', advantages[-1].median().item())
                loss = self.update(transitions, returns, advantages)
                del transitions[:]
                self.model.restore_hidden()
                print("Total time: {:.2f} mins".format(
                    (time.time() - start) / 60.))
            if step % self.params['checkpoint_interval'] == 0:
                parameters = {'model': self.model}
                torch.save(
                    parameters,
                    os.path.join(self.params['output_dir'], 'kga2c.pt'))
        self.vec_env.close_extras()

    def update(self, transitions, returns, advantages):
        # One optimization step over a BPTT window: supervised template /
        # object losses + policy gradient + value + entropy losses.
        assert len(transitions) == len(returns) == len(advantages)
        loss = 0
        for trans, ret, adv in zip(transitions, returns, advantages):
            tmpl_pred_tt, obj_pred_tt, value, _, _, tmpl_gt_tt, dec_tmpl_tt, \
                dec_obj_tt, obj_mask_gt_tt, graph_mask_tt, dec_steps = trans
            # Supervised Template Loss
            tmpl_probs = F.softmax(tmpl_pred_tt, dim=1)
            template_loss = self.params['template_coeff'] * self.loss_fn1(
                tmpl_probs, tmpl_gt_tt)
            # Supervised Object Loss
            object_mask_target = obj_mask_gt_tt.permute((1, 0, 2))
            obj_probs = F.softmax(obj_pred_tt, dim=2)
            object_mask_loss = self.params['object_coeff'] * self.loss_fn1(
                obj_probs, object_mask_target)
            # Build the object mask
            o1_mask, o2_mask = [0] * self.batch_size, [0] * self.batch_size
            for d, st in enumerate(dec_steps):
                if st > 1:
                    o1_mask[d] = 1
                    o2_mask[d] = 1
                elif st == 1:
                    o1_mask[d] = 1
            o1_mask = torch.FloatTensor(o1_mask).cuda()
            o2_mask = torch.FloatTensor(o2_mask).cuda()
            # Policy Gradient Loss
            policy_obj_loss = torch.FloatTensor([0]).cuda()
            cnt = 0
            for i in range(self.batch_size):
                if dec_steps[i] >= 1:
                    cnt += 1
                    # Restrict log-softmax to the graph-masked vocabulary.
                    batch_pred = obj_pred_tt[0, i, graph_mask_tt[i]]
                    action_log_probs_obj = F.log_softmax(batch_pred, dim=0)
                    dec_obj_idx = dec_obj_tt[0, i].item()
                    graph_mask_list = graph_mask_tt[i].nonzero().squeeze(
                    ).cpu().numpy().flatten().tolist()
                    idx = graph_mask_list.index(dec_obj_idx)
                    log_prob_obj = action_log_probs_obj[idx]
                    policy_obj_loss += -log_prob_obj * adv[i].detach()
            if cnt > 0:
                policy_obj_loss /= cnt
            tb.logkv_mean('PolicyObjLoss', policy_obj_loss.item())
            log_probs_obj = F.log_softmax(obj_pred_tt, dim=2)
            log_probs_tmpl = F.log_softmax(tmpl_pred_tt, dim=1)
            action_log_probs_tmpl = log_probs_tmpl.gather(
                1, dec_tmpl_tt).squeeze()
            policy_tmpl_loss = (-action_log_probs_tmpl *
                                adv.detach().squeeze()).mean()
            tb.logkv_mean('PolicyTemplateLoss', policy_tmpl_loss.item())
            policy_loss = policy_tmpl_loss + policy_obj_loss
            value_loss = self.params['value_coeff'] * self.loss_fn3(value, ret)
            tmpl_entropy = -(tmpl_probs * log_probs_tmpl).mean()
            tb.logkv_mean('TemplateEntropy', tmpl_entropy.item())
            object_entropy = -(obj_probs * log_probs_obj).mean()
            tb.logkv_mean('ObjectEntropy', object_entropy.item())
            # Minimizing entropy loss will lead to increased entropy
            entropy_loss = self.params['entropy_coeff'] * -(tmpl_entropy +
                                                            object_entropy)
            loss += template_loss + object_mask_loss + value_loss + entropy_loss + policy_loss
        # NOTE(review): the per-component logkv calls below log the values of
        # the LAST transition in the window only.
        tb.logkv('Loss', loss.item())
        tb.logkv('TemplateLoss', template_loss.item())
        tb.logkv('ObjectLoss', object_mask_loss.item())
        tb.logkv('PolicyLoss', policy_loss.item())
        tb.logkv('ValueLoss', value_loss.item())
        tb.logkv('EntropyLoss', entropy_loss.item())
        tb.dumpkvs()
        loss.backward()
        # Compute the gradient norm
        grad_norm = 0
        for p in list(
                filter(lambda p: p.grad is not None,
                       self.model.parameters())):
            grad_norm += p.grad.data.norm(2).item()
        tb.logkv('UnclippedGradNorm', grad_norm)
        nn.utils.clip_grad_norm_(self.model.parameters(), self.params['clip'])
        # Clipped Grad norm
        grad_norm = 0
        for p in list(
                filter(lambda p: p.grad is not None,
                       self.model.parameters())):
            grad_norm += p.grad.data.norm(2).item()
        tb.logkv('ClippedGradNorm', grad_norm)
        self.optimizer.step()
        self.optimizer.zero_grad()
        return loss

    def decode_actions(self, decoded_templates, decoded_objects):
        '''
        Returns string representations of the given template actions.

        :param decoded_template: Tensor of template indices.
        :type decoded_template: Torch tensor of size (Batch_size x 1).
        :param decoded_objects: Tensor of o1, o2 object indices.
        :type decoded_objects: Torch tensor of size (2 x Batch_size x 1).
        '''
        decoded_actions = []
        for i in range(self.batch_size):
            decoded_template = decoded_templates[i].item()
            decoded_object1 = decoded_objects[0][i].item()
            decoded_object2 = decoded_objects[1][i].item()
            decoded_action = self.tmpl_to_str(decoded_template,
                                              decoded_object1,
                                              decoded_object2)
            decoded_actions.append(decoded_action)
        return decoded_actions

    def tmpl_to_str(self, template_idx, o1_id, o2_id):
        """ Returns a string representation of a template action. """
        template_str = self.template_generator.templates[template_idx]
        holes = template_str.count('OBJ')
        assert holes <= 2
        if holes <= 0:
            return template_str
        elif holes == 1:
            return template_str.replace('OBJ', self.vocab_act[o1_id])
        else:
            # Fill the two OBJ slots left-to-right with o1 then o2.
            return template_str.replace('OBJ', self.vocab_act[o1_id], 1)\
                .replace('OBJ', self.vocab_act[o2_id], 1)
def main():
    """Train an A2C agent on VizDoom with optional Rarity-of-Events rewards.

    Uses module-level ``args`` and publishes the vectorized environment via the
    global ``envs``. Writes per-update reward/event logs and periodic model
    checkpoints.

    Fixes vs. previous revision:
      * the ``if args.roe:`` resume branch contained a bare name ``e`` which
        raised ``NameError`` at runtime — replaced with ``pass``;
      * resume assigned ``start_steps`` while the initialized name is
        ``start_step`` — unified on ``start_step`` (neither is read later);
      * ``pickle.load(open(...))`` / ``pickle.dump(..., open(...))`` leaked
        file handles — now use ``with``;
      * a fresh ``CNNPolicy`` is created only when NOT resuming, so the loaded
        checkpoint is not clobbered.
    """
    print("###############################################################")
    print("#################### VIZDOOM LEARNER START ####################")
    print("###############################################################")
    save_path = os.path.join(args.save_dir, "a2c")
    num_updates = int(args.num_frames) // args.num_steps // args.num_processes
    reward_name = ""
    if args.roe:
        reward_name = "_event"
    scenario_name = args.config_path.split("/")[1].split(".")[0]
    print("############### " + scenario_name + " ###############")
    log_file_name = "vizdoom_" + scenario_name + reward_name + "_" + str(
        args.agent_id) + ".log"
    log_event_file_name = "vizdoom_" + scenario_name + reward_name + "_" + str(
        args.agent_id) + ".eventlog"
    log_event_reward_file_name = "vizdoom_" + scenario_name + reward_name + "_" + str(
        args.agent_id) + ".eventrewardlog"
    start_updates = 0
    start_step = 0
    best_final_rewards = -1000000.0
    # Keep BLAS/OpenMP single-threaded per worker process.
    os.environ['OMP_NUM_THREADS'] = '1'
    global envs
    es = [
        make_env(i, args.config_path, visual=args.visual, bots=args.bots)
        for i in range(args.num_processes)
    ]
    envs = VecEnv([es[i] for i in range(args.num_processes)])
    obs_shape = envs.observation_space_shape
    # Stack the last `num_stack` frames along the channel dimension.
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
    if args.resume:
        # Resume: load model and recover progress counters from the last log
        # line ("update,step,...").
        actor_critic = torch.load(
            os.path.join(save_path, log_file_name + ".pt"))
        filename = glob.glob(os.path.join(args.log_dir, log_file_name))[0]
        if args.roe:
            pass  # TODO: Load event buffer
        with open(filename) as file:
            lines = file.readlines()
            start_updates = int(lines[-1].strip().split(",")[0])
            start_step = int(lines[-1].strip().split(",")[1])
        num_updates += start_updates
    else:
        if not args.debug:
            # Fresh run: create the log dir, or wipe stale logs for this
            # configuration if it already exists.
            try:
                os.makedirs(args.log_dir)
            except OSError:
                files = glob.glob(os.path.join(args.log_dir, log_file_name))
                for f in files:
                    os.remove(f)
            with open(log_file_name, "w") as myfile:
                myfile.write("")
            files = glob.glob(os.path.join(args.log_dir, log_event_file_name))
            for f in files:
                os.remove(f)
            with open(log_event_file_name, "w") as myfile:
                myfile.write("")
            files = glob.glob(
                os.path.join(args.log_dir, log_event_reward_file_name))
            for f in files:
                os.remove(f)
            with open(log_event_reward_file_name, "w") as myfile:
                myfile.write("")
        # Only build a fresh policy when not resuming (otherwise the loaded
        # checkpoint would be overwritten).
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space_shape)
    action_shape = 1
    if args.cuda:
        actor_critic.cuda()
    optimizer = optim.RMSprop(actor_critic.parameters(),
                              args.lr,
                              eps=args.eps,
                              alpha=args.alpha)
    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space_shape)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        # Shift the frame stack left and append the newest observation.
        shape_dim0 = envs.observation_space_shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)
    last_game_vars = []
    for i in range(args.num_processes):
        last_game_vars.append(np.zeros(args.num_events))
    rollouts.observations[0].copy_(current_obs)
    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])
    episode_intrinsic_rewards = torch.zeros([args.num_processes, 1])
    final_intrinsic_rewards = torch.zeros([args.num_processes, 1])
    episode_events = torch.zeros([args.num_processes, args.num_events])
    final_events = torch.zeros([args.num_processes, args.num_events])
    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()
    # Create event buffer
    if args.qd:
        event_buffer = EventBufferSQLProxy(args.num_events, args.capacity,
                                           args.exp_id, args.agent_id)
    elif not args.resume:
        event_buffer = EventBuffer(args.num_events, args.capacity)
    else:
        with open(log_file_name + "_event_buffer_temp.p", "rb") as fh:
            event_buffer = pickle.load(fh)
    event_episode_rewards = []
    start = time.time()
    for j in np.arange(start_updates, num_updates):
        for step in range(args.num_steps):
            # Sample actions (volatile=True: legacy pre-0.4 no-grad inference).
            value, action = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()
            obs, reward, done, info, events = envs.step(cpu_actions)
            intrinsic_reward = []
            # Fix broken rewards - upscale
            for i in range(len(reward)):
                if scenario_name in ["deathmatch", "my_way_home"]:
                    reward[i] *= 100
                if scenario_name == "deadly_corridor":
                    reward[i] = 1 if events[i][2] >= 1 else 0
            # Intrinsic reward: rarity-of-events when --roe, else the plain
            # environment reward.
            for e in events:
                if args.roe:
                    intrinsic_reward.append(event_buffer.intrinsic_reward(e))
                else:
                    r = reward[len(intrinsic_reward)]
                    intrinsic_reward.append(r)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            intrinsic_reward = torch.from_numpy(
                np.expand_dims(np.stack(intrinsic_reward), 1)).float()
            events = torch.from_numpy(events).float()
            episode_rewards += reward
            episode_intrinsic_rewards += intrinsic_reward
            episode_events += events
            # Event stats: current intrinsic reward of each single event.
            event_rewards = []
            for ei in range(0, args.num_events):
                ev = np.zeros(args.num_events)
                ev[ei] = 1
                er = event_buffer.intrinsic_reward(ev)
                event_rewards.append(er)
            event_episode_rewards.append(event_rewards)
            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_intrinsic_rewards *= masks
            final_events *= masks
            final_rewards += (1 - masks) * episode_rewards
            final_intrinsic_rewards += (1 - masks) * episode_intrinsic_rewards
            final_events += (1 - masks) * episode_events
            for i in range(args.num_processes):
                if done[i]:
                    event_buffer.record_events(np.copy(
                        final_events[i].numpy()),
                                               frame=j * args.num_steps)
            episode_rewards *= masks
            episode_intrinsic_rewards *= masks
            episode_events *= masks
            if args.cuda:
                masks = masks.cuda()
            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks
            update_current_obs(obs)
            # Note: the agent is trained on the intrinsic reward.
            rollouts.insert(step, current_obs, action.data, value.data,
                            intrinsic_reward, masks)
        final_episode_reward = np.mean(event_episode_rewards, axis=0)
        event_episode_rewards = []
        # Bootstrap the return from the value of the final observation.
        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True))[0].data
        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.observations[:-1].view(
                -1, *obs_shape))
        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)
        values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
            Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
            Variable(rollouts.actions.view(-1, action_shape)))
        values = values.view(args.num_steps, args.num_processes, 1)
        action_log_probs = action_log_probs.view(args.num_steps,
                                                 args.num_processes, 1)
        advantages = Variable(rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()
        # Detach advantages so the policy gradient does not flow into the
        # value head.
        action_loss = -(Variable(advantages.data) * action_log_probs).mean()
        optimizer.zero_grad()
        (value_loss * args.value_loss_coef + action_loss -
         dist_entropy * args.entropy_coef).backward()
        nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
        optimizer.step()
        rollouts.observations[0].copy_(rollouts.observations[-1])
        # Keep a separate checkpoint of the best-performing model so far.
        if final_rewards.mean() > best_final_rewards and not args.debug:
            try:
                os.makedirs(save_path)
            except OSError:
                pass
            best_final_rewards = final_rewards.mean()
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(
                save_model,
                os.path.join(save_path,
                             log_file_name.split(".log")[0] + ".pt"))
        if j % args.save_interval == 0 and args.save_dir != "" and not args.debug:
            try:
                os.makedirs(save_path)
            except OSError:
                pass
            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(save_model,
                       os.path.join(save_path, log_file_name + "_temp.pt"))
            if isinstance(event_buffer, EventBuffer):
                with open(log_file_name + "_event_buffer_temp.p",
                          "wb") as fh:
                    pickle.dump(event_buffer, fh)
        if j % args.log_interval == 0:
            envs.log()
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            log = "Updates {}, num timesteps {}, FPS {}, mean/max reward {:.5f}/{:.5f}, mean/max intrinsic reward {:.5f}/{:.5f}"\
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(),
                        final_rewards.max(),
                        final_intrinsic_rewards.mean(),
                        final_intrinsic_rewards.max()
                        )
            log_to_file = "{}, {}, {:.5f}, {:.5f}, {:.5f}, {:.5f}\n" \
                .format(j, total_num_steps,
                        final_rewards.mean(),
                        final_rewards.std(),
                        final_intrinsic_rewards.mean(),
                        final_intrinsic_rewards.std())
            log_to_event_file = ','.join(
                map(str, event_buffer.get_event_mean().tolist())) + "\n"
            log_to_event_reward_file = ','.join(
                map(str, event_buffer.get_event_rewards().tolist())) + "\n"
            print(log)
            print(log_to_event_file)
            # Save to files
            with open(log_file_name, "a") as myfile:
                myfile.write(log_to_file)
            with open(log_event_file_name, "a") as myfile:
                myfile.write(str(total_num_steps) + "," + log_to_event_file)
            with open(log_event_reward_file_name, "a") as myfile:
                myfile.write(
                    str(total_num_steps) + "," + log_to_event_reward_file)
    envs.close()
    time.sleep(5)
def main():
    """Train an A2C agent on a vectorized board-game environment.

    Relies on module-level ``args``, ``make_env``, ``update_obs`` and
    ``utils``. Periodically logs progress and checkpoints the model to
    ``models/<model_name>``.

    Fix vs. previous revision: ``args.resume & args.log`` used the bitwise
    ``&`` operator on booleans; replaced with logical ``and`` (same result for
    bools, but idiomatic and short-circuiting).
    """
    es = [make_env(i, args.board_size) for i in range(args.num_processes)]
    envs = VecEnv([es[i] for i in range(args.num_processes)])
    spatial_obs_space = es[0].observation_space.spaces['board'].shape
    non_spatial_space = (1, 50)
    action_space = len(es[0].actions)
    # MODELS #
    if args.resume:
        ac_agent = torch.load("models/" + args.model_name)  # Load model
    else:
        ac_agent = PrunedHybrid(spatial_obs_space[0], action_space,
                                args.board_size)
    optimizer = optim.RMSprop(ac_agent.parameters(), args.learning_rate)
    # Creating the memory to store the steps taken; the flat action space
    # size depends on the board size.
    if args.board_size == 1:
        action_space = 242
    elif args.board_size == 3:
        action_space = 492
    elif args.board_size == 5:
        action_space = 908
    else:
        raise NotImplementedError("Not able to handle board size",
                                  args.board_size)
    memory = Memory(args.num_steps, args.num_processes, spatial_obs_space,
                    non_spatial_space, action_space)
    obs = envs.reset()
    spatial_obs, non_spatial_obs = update_obs(obs)
    memory.spatial_obs[0].copy_(torch.from_numpy(spatial_obs).float())
    memory.non_spatial_obs[0].copy_(torch.from_numpy(non_spatial_obs).float())
    # When resuming, recover cumulative counters from the last log line.
    if args.resume and args.log:
        log_file = "logs/" + args.log_filename
        with open(log_file) as log_fh:
            lines = log_fh.readlines()[-1]
            resume_updates = float(lines.split(", ")[0])
            resume_episodes = float(lines.split(", ")[1])
            resume_steps = float(lines.split(", ")[3])
    else:
        resume_updates = 0
        resume_episodes = 0
        resume_steps = 0
    renderer = Renderer()
    rewards = 0
    episodes = 0
    for update in range(args.num_updates):
        for step in range(args.num_steps):
            available_actions = envs.actions()
            active_players = envs.active_players()
            own_players = envs.own_players()
            values, actions_policy = ac_agent.act(
                Variable(memory.spatial_obs[step]),
                Variable(memory.non_spatial_obs[step]), available_actions)
            # Map flat policy outputs to (action, x, y) per board size.
            if args.board_size == 1:
                actions, x_positions, y_positions = utils.map_actions_1v1(
                    actions_policy)
            elif args.board_size == 3:
                actions, x_positions, y_positions = utils.map_actions_3v3_new_approach(
                    actions_policy, active_players, own_players)
            elif args.board_size == 5:
                actions, x_positions, y_positions = utils.map_actions_5v5_pruned(
                    actions_policy, active_players, own_players)
            else:
                raise NotImplementedError("Not able to handle board size",
                                          args.board_size)
            action_objects = []
            for action, position_x, position_y in zip(actions, x_positions,
                                                      y_positions):
                action_object = {
                    'action-type': action,
                    'x': position_x,
                    'y': position_y
                }
                action_objects.append(action_object)
            obs, reward, done, info, events = envs.step(action_objects)
            if args.render:
                for i in range(args.num_processes):
                    renderer.render(obs[i], i)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            rewards += reward.sum().item()
            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            dones = masks.squeeze()
            episodes += args.num_processes - dones.sum().item()
            # Update the observations returned by the environment
            spatial_obs, non_spatial_obs = update_obs(obs)
            # insert the step taken into memory
            memory.insert(step,
                          torch.from_numpy(spatial_obs).float(),
                          torch.from_numpy(non_spatial_obs).float(),
                          torch.tensor(actions_policy), torch.tensor(values),
                          reward, masks, available_actions)
        # Bootstrap the return from the value of the final observation.
        next_value = ac_agent(Variable(memory.spatial_obs[-1]),
                              Variable(memory.non_spatial_obs[-1]))[0].data
        # Compute returns
        memory.compute_returns(next_value, args.gamma)
        spatial = Variable(memory.spatial_obs[:-1])
        spatial = spatial.view(-1, *spatial_obs_space)
        non_spatial = Variable(memory.non_spatial_obs[:-1])
        non_spatial = non_spatial.view(-1, 50)
        actions = Variable(torch.LongTensor(memory.actions.view(-1, 1)))
        actions_mask = Variable(memory.available_actions[:-1])
        # Evaluate the actions taken
        action_log_probs, values, dist_entropy = ac_agent.evaluate_actions(
            Variable(spatial), Variable(non_spatial), actions, actions_mask)
        values = values.view(args.num_steps, args.num_processes, 1)
        action_log_probs = action_log_probs.view(args.num_steps,
                                                 args.num_processes, 1)
        advantages = Variable(memory.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()
        # Compute loss (advantages detached for the policy term).
        action_loss = -(Variable(advantages.data) * action_log_probs).mean()
        optimizer.zero_grad()
        total_loss = (value_loss * args.value_loss_coef + action_loss -
                      dist_entropy * args.entropy_coef)
        total_loss.backward()
        nn.utils.clip_grad_norm_(ac_agent.parameters(), args.max_grad_norm)
        optimizer.step()
        memory.non_spatial_obs[0].copy_(memory.non_spatial_obs[-1])
        memory.spatial_obs[0].copy_(memory.spatial_obs[-1])
        # Logging
        if (update + 1) % args.log_interval == 0 and args.log:
            log_file_name = "logs/" + args.log_filename
            # Updates
            # NOTE(review): `updates`/`steps` below count from the start of
            # the run / a single update rather than per log interval, so the
            # resume_* accumulators may over-count — confirm intended.
            updates = update + 1
            resume_updates += updates
            # Episodes
            resume_episodes += episodes
            # Steps
            steps = args.num_processes * args.num_steps
            resume_steps += steps
            # Rewards
            reward = rewards
            mean_reward_pr_episode = reward / episodes
            log = "Updates {}, Episodes {}, Episodes this update {}, Total Timesteps {}, Reward {}, Mean Reward pr. Episode {:.2f}"\
                .format(resume_updates, resume_episodes, episodes,
                        resume_steps, reward, mean_reward_pr_episode)
            log_to_file = "{}, {}, {}, {}, {}, {}\n" \
                .format(resume_updates, resume_episodes, episodes,
                        resume_steps, reward, mean_reward_pr_episode)
            print(log)
            # Save to files
            with open(log_file_name, "a") as myfile:
                myfile.write(log_to_file)
            # Saving the agent
            torch.save(ac_agent, "models/" + args.model_name)
            rewards = 0
            episodes = 0
def main():
    """Train an A2C agent on a VizDoom scenario, maintaining a QD-style
    archive of elite policies keyed by event-count behaviour descriptors.

    Reads all configuration from the module-level ``args`` namespace and the
    global ``envs``.  Side effects: creates/overwrites files under
    ``args.save_dir``/``args.log_dir``, mutates the shared event-buffer DB via
    ``EventBufferSQLProxy``, and saves model checkpoints.

    NOTE(review): this code targets a pre-0.4 PyTorch API
    (``Variable(..., volatile=True)``, ``nn.utils.clip_grad_norm`` without the
    trailing underscore) — confirm the pinned torch version before upgrading.
    """
    print("###############################################################")
    print("#################### VIZDOOM LEARNER START ####################")
    print("###############################################################")
    # Per-experiment output directories: models under save_path, logs under log_path.
    save_path = os.path.join(args.save_dir, str(args.exp_id))
    log_path = os.path.join(args.log_dir, str(args.exp_id))
    # Total number of A2C updates implied by the frame budget.
    num_updates = int(args.num_frames) // args.num_steps // args.num_processes
    reward_name = ""
    if args.roe:
        # Rarity-of-events mode gets a distinct log-file suffix.
        reward_name = "_event"
    # Scenario name is the config file's base name, e.g. "scenarios/deathmatch.cfg" -> "deathmatch".
    # NOTE(review): assumes config_path always has exactly one directory component — confirm.
    scenario_name = args.config_path.split("/")[1].split(".")[0]
    print("############### " + scenario_name + " ###############")
    log_file_name = "vizdoom_" + scenario_name + reward_name + "_" + str(
        args.exp_id) + "_" + str(args.agent_id) + ".log"
    #log_event_file_name = "vizdoom_" + scenario_name + reward_name + "_" + str(args.exp_id) + "_" + str(args.agent_id) + ".eventlog"
    #log_event_reward_file_name = "vizdoom_" + scenario_name + reward_name + "_" + str(args.exp_id) + "_" + str(args.agent_id) + ".eventrewardlog"
    start_updates = 0
    start_step = 0  # NOTE(review): never read; the resume branch assigns 'start_steps' (plural) instead — looks like a typo upstream.
    best_final_rewards = -1000000.0  # NOTE(review): unused in this function.
    os.environ['OMP_NUM_THREADS'] = '1'
    # CIG (competition) scenarios need a differently-configured env.
    cig = "cig" in args.config_path
    global envs
    es = [
        make_env(i, args.config_path, visual=args.visual, cig=cig)
        for i in range(args.num_processes)
    ]
    envs = VecEnv([es[i] for i in range(args.num_processes)])
    # Stack num_stack frames along the channel dimension.
    obs_shape = envs.observation_space_shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
    if args.resume:
        # Resume: reload the checkpoint and recover progress counters from the
        # last line of the existing log file ("updates, steps, ...").
        actor_critic = torch.load(
            os.path.join(save_path, f"{args.agent_id}.pt"))
        filename = glob.glob(os.path.join(log_path, log_file_name))[0]
        with open(filename) as file:
            lines = file.readlines()
            start_updates = (int)(lines[-1].strip().split(",")[0])
            start_steps = (int)(lines[-1].strip().split(",")[1])
        # Extend the budget so we still run num_updates *additional* updates.
        num_updates += start_updates
    else:
        # Fresh run: create output dirs; if the log dir already exists, remove
        # any stale log file for this experiment/agent.
        try:
            os.makedirs(save_path)
        except OSError:
            pass
        try:
            os.makedirs(log_path)
        except OSError:
            files = glob.glob(os.path.join(args.log_dir, log_file_name))
            for f in files:
                os.remove(f)
        #with open(log_file_name, "w") as myfile:
        #    myfile.write("")
        #files = glob.glob(os.path.join(args.log_dir, log_event_file_name))
        #for f in files:
        #    os.remove(f)
        #with open(log_event_file_name, "w") as myfile:
        #    myfile.write("")
        #files = glob.glob(os.path.join(args.log_dir, log_event_reward_file_name))
        #for f in files:
        #    os.remove(f)
        #with open(log_event_reward_file_name, "w") as myfile:
        #    myfile.write("")
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space_shape)
    action_shape = 1  # discrete action index stored as a single column
    if args.cuda:
        actor_critic.cuda()
    optimizer = optim.RMSprop(actor_critic.parameters(),
                              args.lr,
                              eps=args.eps,
                              alpha=args.alpha)
    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space_shape)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        # Shift the frame stack left by one raw frame and append the new one.
        shape_dim0 = envs.observation_space_shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)
    # Per-process game-variable snapshots (one vector of event counters each).
    last_game_vars = []
    for i in range(args.num_processes):
        last_game_vars.append(np.zeros(args.num_events))
    rollouts.observations[0].copy_(current_obs)
    # These variables are used to compute average rewards for all processes.
    # "episode_*" accumulate within the running episode; "final_*" hold the
    # totals of the most recently completed episode per process.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])
    episode_intrinsic_rewards = torch.zeros([args.num_processes, 1])
    final_intrinsic_rewards = torch.zeros([args.num_processes, 1])
    episode_events = torch.zeros([args.num_processes, args.num_events])
    final_events = torch.zeros([args.num_processes, args.num_events])
    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    def mean_distance_to_nearest_neighbor(elite_events):
        # Mean, over all elites, of each elite's Euclidean distance to its
        # nearest other elite (a diversity measure over behaviour vectors).
        d = []
        nearest = None
        for a in range(len(elite_events)):
            for b in range(len(elite_events)):
                if a != b:
                    elite_a = elite_events[a]
                    elite_b = elite_events[b]
                    dist = np.linalg.norm(elite_a - elite_b)
                    if nearest is None or dist < nearest:
                        nearest = dist
            if nearest is not None:
                d.append(nearest)
            nearest = None
        return np.mean(d)

    def distance_to_nearest_neighbor(elite_events, events):
        # Euclidean distance from `events` to the closest elite behaviour.
        nearest = None
        for elite_a in elite_events:
            dist = np.linalg.norm(elite_a - events)
            if nearest is None or dist < nearest:
                nearest = dist
        return nearest

    def add_to_archive(frame, episode_length):
        # MAP-Elites-style insertion: the current policy enters the archive
        # only if its fitness (mean final reward) beats every niche neighbour;
        # displaced neighbours are removed from the buffer and their model
        # files deleted.
        #print("Final rewards: ", final_rewards.numpy())
        fitness = final_rewards.numpy().mean()
        #print("raw: ", final_events.numpy())
        behavior = final_events.numpy().mean(axis=0)
        #print("Fitness:", fitness)
        #print("Behavior:", behavior)
        neighbors = event_buffer.get_neighbors(behavior, args.niche_divs,
                                               episode_length)
        # Empty niche -> always add; otherwise must dominate all neighbours.
        add = len(neighbors) == 0
        for neighbor in neighbors:
            if fitness > neighbor.fitness:
                add = True
            else:
                add = False
                break
        if add:
            if len(neighbors) > 0:
                event_buffer.remove_elites(neighbors)
                #print(f"- Removing elites {[neighbor.elite_id for neighbor in neighbors]}")
                for neighbor in neighbors:
                    try:
                        #print(f"- Deleting model {neighbor.elite_id}")
                        os.remove(
                            os.path.join(save_path,
                                         f"{neighbor.elite_id}.pt"))
                        #print("Successfully deleted model with id : ", neighbor.elite_id)
                    except:  # NOTE(review): bare except also swallows KeyboardInterrupt; OSError would be narrower.
                        print("Error while deleting model with id : ",
                              neighbor.elite_id)
            name = str(uuid.uuid1())
            #print("Adding elite")
            event_buffer.add_elite(name, behavior, fitness, frame,
                                   episode_length)
            # Checkpoint on CPU so the file loads on CUDA-less machines.
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(save_model, os.path.join(save_path, f"{name}.pt"))

    # Create event buffer (shared, SQL-backed; other agents in the experiment
    # see the same archive).
    event_buffer = EventBufferSQLProxy(args.num_events,
                                       args.capacity,
                                       args.exp_id,
                                       args.agent_id,
                                       qd=args.qd,
                                       per_step=args.per_step)
    event_episode_rewards = []
    # Per-process count of steps consumed by already-finished episodes.
    episode_finished = np.zeros(args.num_processes)
    start = time.time()
    for j in np.arange(start_updates, num_updates):
        # ---- collect a rollout of num_steps transitions ----
        for step in range(args.num_steps):
            value, action = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()
            obs, reward, done, info, events = envs.step(cpu_actions)
            intrinsic_reward = []
            # Fix broken rewards - upscale
            for i in range(len(reward)):
                if scenario_name in ["deathmatch", "my_way_home"]:
                    reward[i] *= 100
                if scenario_name == "deadly_corridor":
                    # Reward solely for event index 2 — presumably a kill
                    # counter; TODO confirm against the env's event layout.
                    reward[i] = 1 if events[i][2] >= 1 else 0
            # Intrinsic reward: rarity-of-events bonus when args.roe, else
            # just the (fixed) extrinsic reward.
            for e in events:
                if args.roe:
                    ir = event_buffer.intrinsic_reward(e)
                    if args.per_step:
                        # 4200 = normalisation constant; presumably the max
                        # episode length in steps — verify.
                        ir = ir / 4200
                    intrinsic_reward.append(ir)
                else:
                    r = reward[len(intrinsic_reward)]
                    intrinsic_reward.append(r)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            intrinsic_reward = torch.from_numpy(
                np.expand_dims(np.stack(intrinsic_reward), 1)).float()
            #events = torch.from_numpy(np.expand_dims(np.stack(events), args.num_events)).float()
            events = torch.from_numpy(events).float()
            episode_rewards += reward
            episode_intrinsic_rewards += intrinsic_reward
            episode_events += events
            # Event stats
            '''
            event_rewards = []
            for ei in range(0,args.num_events):
                ev = np.zeros(args.num_events)
                ev[ei] = 1
                er = event_buffer.intrinsic_reward(ev)
                if args.per_step:
                    er = er / 4200
                er = event_buffer.intrinsic_reward(ev)
                event_rewards.append(er)
            event_episode_rewards.append(event_rewards)
            '''
            # If done then clean the history of observations.
            # masks[i] == 0.0 exactly when process i's episode just ended.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            # Latch finished-episode totals into final_*, zeroing stale values
            # for processes that just terminated.
            final_rewards *= masks
            final_intrinsic_rewards *= masks
            final_events *= masks
            final_rewards += (1 - masks) * episode_rewards
            final_intrinsic_rewards += (1 - masks) * episode_intrinsic_rewards
            final_events += (1 - masks) * episode_events
            for i in range(args.num_processes):
                if done[i]:
                    #event_buffer.record_events(np.copy(final_events[i].numpy()), frame=j*args.num_steps*args.num_processes)
                    episode_length = (step +
                                      j * args.num_steps) - episode_finished[i]
                    episode_finished[i] = episode_length + episode_finished[i]
                    # Try inserting the just-finished episode into the archive.
                    add_to_archive(
                        step * args.num_processes +
                        j * args.num_steps * args.num_processes,
                        episode_length)
            episode_rewards *= masks
            episode_intrinsic_rewards *= masks
            episode_events *= masks
            if args.cuda:
                masks = masks.cuda()
            # Zero out the frame stack of terminated processes.
            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks
            update_current_obs(obs)
            # Training uses the intrinsic reward, not the raw env reward.
            rollouts.insert(step, current_obs, action.data, value.data,
                            intrinsic_reward, masks)
        #final_episode_reward = np.mean(event_episode_rewards, axis=0)
        #event_episode_rewards = []
        # ---- A2C update over the collected rollout ----
        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True))[0].data
        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.observations[:-1].view(
                -1, *obs_shape))
        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)
        values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
            Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
            Variable(rollouts.actions.view(-1, action_shape)))
        values = values.view(args.num_steps, args.num_processes, 1)
        action_log_probs = action_log_probs.view(args.num_steps,
                                                 args.num_processes, 1)
        advantages = Variable(rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()
        # advantages.data detaches the policy gradient from the value head.
        action_loss = -(Variable(advantages.data) * action_log_probs).mean()
        optimizer.zero_grad()
        (value_loss * args.value_loss_coef + action_loss -
         dist_entropy * args.entropy_coef).backward()
        nn.utils.clip_grad_norm(actor_critic.parameters(),
                                args.max_grad_norm)
        optimizer.step()
        # Carry the last observation over as the next rollout's first.
        rollouts.observations[0].copy_(rollouts.observations[-1])
        if j % args.log_interval == 0:
            envs.log()
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            log = "Updates {}, num timesteps {}, FPS {}, mean/max reward {:.5f}/{:.5f}, mean/max intrinsic reward {:.5f}/{:.5f}"\
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(),
                        final_rewards.max(),
                        final_intrinsic_rewards.mean(),
                        final_intrinsic_rewards.max()
                        )
            # CSV line consumed by the resume branch above ("updates, steps, ...").
            log_to_file = "{}, {}, {:.5f}, {:.5f}, {:.5f}, {:.5f}\n" \
                .format(j, total_num_steps,
                        final_rewards.mean(),
                        final_rewards.std(),
                        final_intrinsic_rewards.mean(),
                        final_intrinsic_rewards.std())
            with open(os.path.join(log_path, log_file_name), "a") as myfile:
                myfile.write(log_to_file)
            # Periodic checkpoint of the current (non-elite) agent.
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(save_model,
                       os.path.join(save_path, f"{args.agent_id}.pt"))
            print(log)
    envs.close()
    # Give worker processes time to shut down before the interpreter exits.
    time.sleep(5)
envs = VecEnv([ make_visual_env('./scenarios/deathmatch_maze.cfg') for i in range(num_envs) ]) else: envs = VecEnv([ make_env(0, './scenarios/deathmatch_maze.cfg') for i in range(num_envs) ]) # Define some actions. Each list entry corresponds to declared buttons: # MOVE_LEFT, MOVE_RIGHT, ATTACK # 5 more combinations are naturally possible but only 3 are included for transparency when watching. # actions = [[True, False, False], [False, True, False], [False, False, True]] actions = range(envs.action_space_shape) episode_num = 0 while True: print('Episode #', episode_num) for j in range(1000): action_array = [choice(actions) for i in range(num_envs)] # print (action_array) obs, reward, done, info = envs.step(action_array) if done: game_vars = envs.get_game_variables(0) print('Kills : ', game_vars[2]) episode_num += 1 break # print ('Reward:', reward) sleep(0.01) envs.reset() sleep(0.1)
# Evaluation driver: run the policy deterministically for num_episodes
# episodes in env 0, accumulating total reward and kills.
# NOTE(review): depends on envs / actor_critic / num_episodes / current_obs /
# update_current_obs defined elsewhere in this script; the torch.load below
# appears *after* the loop that uses actor_critic — likely a chunking artifact
# of this excerpt; confirm ordering against the full file.
total_reward = 0.0
episode_cnt = 0
episode_reward = 0.0
total_kills = 0.0
while episode_cnt < num_episodes:
    # sleep(0.01)
    # Print the action distribution for debugging, then act greedily.
    print(actor_critic.get_probs(Variable(current_obs, volatile=True)))
    value, action = actor_critic.act(Variable(current_obs, volatile=True),
                                     deterministic=True)
    cpu_actions = action.data.cpu().numpy()
    print('Action:', [cpu_actions[0]])
    # Observe reward and next obs (only env 0 is driven).
    obs, reward, done, _ = envs.step([cpu_actions[0]])
    episode_reward += reward[0]
    if done:
        total_reward += episode_reward
        episode_cnt += 1
        episode_reward = 0.0
        # Game variable index 2 is read as the kill count — TODO confirm
        # against the env's variable layout.
        episode_game_variables = envs.get_game_variables(0)
        if episode_game_variables != None:
            total_kills += episode_game_variables[2]
        obs = envs.reset()
# Load the trained agent and switch to eval mode (disables dropout etc.).
actor_critic = torch.load(
    os.path.join(args.load_dir, args.env_name + ".pt"))
actor_critic.eval()
update_current_obs(obs)
# Save frames #scipy.misc.imsave('./frames/' + scenario + '_' + str(frame) + '.jpg', current_obs.numpy()[0][0]) frame += 1 #actor_critic.vars = Variable(vars) value, action = actor_critic.act(Variable(current_obs, volatile=True), deterministic=deterministic) if deterministic: cpu_actions = action.data.cpu().numpy( ) # Enable for deterministic play else: cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, _, events = envs.step([cpu_actions[0]]) # Fix reward if scenario in ["deathmatch", "my_way_home"]: reward[0] *= 100 if scenario == "deadly_corridor": reward[0] = 1 if events[0][2] >= 1 else 0 #print('Frame', frame) #print ('Reward:', reward[0] * 100) position = envs.get_position()[0] positions_episode.append(position) #vars = torch.from_numpy(np.array(to_input_vars(vars))).float() episode_reward += reward[0] * 100