def run_eval():
    config = generic.load_config()
    agent = Agent(config)
    output_dir = "."
    data_dir = "."

    # make game environments
    requested_infos = agent.select_additional_infos()
    games_dir = "./"

    eval_env, num_eval_game = reinforcement_learning_dataset.get_evaluation_game_env(
        games_dir + config['rl']['data_path'],
        config['rl']['difficulty_level'],
        requested_infos,
        agent.eval_max_nb_steps_per_episode,
        agent.eval_batch_size,
        valid_or_test="test")

    json_file_name = agent.experiment_tag.replace(" ", "_")

    # load pretrained models
    agent.load_pretrained_model(agent.load_from_tag + ".pt",
                                load_partial_graph=False)

    # evaluate
    if agent.real_valued_graph:
        agent.load_pretrained_graph_generation_model(
            data_dir + "/" + agent.load_graph_generation_model_from_tag + ".pt")
        eval_game_points, eval_game_points_normalized, eval_game_step, detailed_scores = evaluate.evaluate_rl_with_real_graphs(
            eval_env, agent, num_eval_game)
        command_generation_f1 = 0.0
    else:
        if agent.eval_g_belief:
            agent.load_pretrained_graph_generation_model(
                data_dir + "/" + agent.load_graph_generation_model_from_tag + ".pt")
            eval_game_points, eval_game_points_normalized, eval_game_step, command_generation_f1, detailed_scores = evaluate.evaluate_belief_mode(
                eval_env, agent, num_eval_game)
        else:
            eval_game_points, eval_game_points_normalized, eval_game_step, _, detailed_scores = evaluate.evaluate(
                eval_env, agent, num_eval_game)
            command_generation_f1 = 0.0

    # write accuracies down into file
    _s = json.dumps({
        "eval game points": str(eval_game_points),
        "eval normalized game points": str(eval_game_points_normalized),
        "eval steps": str(eval_game_step),
        "command generation f1": str(command_generation_f1),
        "detailed scores": detailed_scores
    })
    with open(output_dir + "/" + json_file_name + '.json', 'a+') as outfile:
        outfile.write(_s + '\n')
        outfile.flush()
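
# The evaluation log above is written as JSON Lines: one JSON object is
# appended per call, so a single file can accumulate results from many runs.
# A minimal sketch for reading such a log back; the helper name and example
# path are illustrative, not part of this codebase.
import json

def load_eval_log(path):
    """Return the list of result records appended by run_eval()."""
    records = []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if line:
                records.append(json.loads(line))
    return records

# e.g. records = load_eval_log("./my_experiment_tag.json")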
def train():
    time_1 = datetime.datetime.now()
    config = generic.load_config()
    env = DGIData(config)
    env.split_reset("train")
    agent = Agent(config)
    agent.zero_noise()
    ave_train_loss = generic.HistoryScoreCache(capacity=500)

    # visdom
    if config["general"]["visdom"]:
        import visdom
        viz = visdom.Visdom()
        loss_win = None
        eval_acc_win = None
        viz_loss, viz_eval_loss, viz_eval_acc = [], [], []

    episode_no = 0
    batch_no = 0

    output_dir = "."
    data_dir = "."
    json_file_name = agent.experiment_tag.replace(" ", "_")

    # load model from checkpoint
    if agent.load_pretrained:
        if os.path.exists(output_dir + "/" + agent.experiment_tag + "_model.pt"):
            agent.load_pretrained_model(output_dir + "/" + agent.experiment_tag + "_model.pt",
                                        load_partial_graph=False)
        elif os.path.exists(data_dir + "/" + agent.load_from_tag + ".pt"):
            agent.load_pretrained_model(data_dir + "/" + agent.load_from_tag + ".pt",
                                        load_partial_graph=False)

    best_eval_acc, best_training_loss_so_far = 0.0, 10000.0

    try:
        while True:
            if episode_no > agent.max_episode:
                break
            agent.train()
            triplets = env.get_batch()
            curr_batch_size = len(triplets)
            loss, _, _, _ = agent.get_deep_graph_infomax_logits(triplets)

            # update model
            agent.online_net.zero_grad()
            agent.optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(agent.online_net.parameters(),
                                           agent.clip_grad_norm)
            agent.optimizer.step()
            loss = generic.to_np(loss)
            ave_train_loss.push(loss)

            # lr schedule: logarithmic warmup, then constant
            if batch_no < agent.learning_rate_warmup_until:
                cr = agent.init_learning_rate / math.log2(agent.learning_rate_warmup_until)
                learning_rate = cr * math.log2(batch_no + 1)
            else:
                learning_rate = agent.init_learning_rate
            for param_group in agent.optimizer.param_groups:
                param_group['lr'] = learning_rate

            episode_no += curr_batch_size
            batch_no += 1

            if agent.report_frequency == 0 or (
                    episode_no % agent.report_frequency >
                    (episode_no - curr_batch_size) % agent.report_frequency):
                continue

            eval_acc, eval_loss = 0.0, 0.0
            if episode_no % agent.report_frequency <= (
                    episode_no - curr_batch_size) % agent.report_frequency:
                if agent.run_eval:
                    eval_loss, eval_acc = evaluate.evaluate_deep_graph_infomax(
                        env, agent, "valid")
                    if eval_acc > best_eval_acc:
                        best_eval_acc = eval_acc
                        agent.save_model_to_path(output_dir + "/" + agent.experiment_tag + "_model.pt")
                        print("Saving best model so far! with Eval acc: {:2.3f}".format(best_eval_acc))
                    env.split_reset("train")
                else:
                    if loss < best_training_loss_so_far:
                        best_training_loss_so_far = loss
                        agent.save_model_to_path(output_dir + "/" + agent.experiment_tag + "_model.pt")

            time_2 = datetime.datetime.now()
            print("Episode: {:3d} | time spent: {:s} | sliding window loss: {:2.3f} | Eval Acc: {:2.3f} | Eval Loss: {:2.3f}".format(
                episode_no, str(time_2 - time_1).rsplit(".")[0],
                ave_train_loss.get_avg(), eval_acc, eval_loss))

            # plot using visdom
            if config["general"]["visdom"]:
                viz_loss.append(ave_train_loss.get_avg())
                viz_eval_acc.append(eval_acc)
                viz_eval_loss.append(eval_loss)
                viz_x = np.arange(len(viz_loss)).tolist()
                viz_eval_x = np.arange(len(viz_eval_acc)).tolist()

                if loss_win is None:
                    loss_win = viz.line(X=viz_x, Y=viz_loss,
                                        opts=dict(title=agent.experiment_tag + "_loss"),
                                        name="training loss")
                    viz.line(X=viz_eval_x, Y=viz_eval_loss,
                             opts=dict(title=agent.experiment_tag + "_eval_loss"),
                             win=loss_win, update='append', name="eval loss")
                else:
                    viz.line(X=[len(viz_loss) - 1], Y=[viz_loss[-1]],
                             opts=dict(title=agent.experiment_tag + "_loss"),
                             win=loss_win, update='append', name="training loss")
                    viz.line(X=[len(viz_eval_loss) - 1], Y=[viz_eval_loss[-1]],
                             opts=dict(title=agent.experiment_tag + "_eval_loss"),
                             win=loss_win, update='append', name="eval loss")

                if eval_acc_win is None:
                    eval_acc_win = viz.line(X=viz_eval_x, Y=viz_eval_acc,
                                            opts=dict(title=agent.experiment_tag + "_eval_acc"),
                                            name="eval accuracy")
                else:
                    viz.line(X=[len(viz_eval_acc) - 1], Y=[viz_eval_acc[-1]],
                             opts=dict(title=agent.experiment_tag + "_eval_acc"),
                             win=eval_acc_win, update='append', name="eval accuracy")

            # write accuracies down into file
            _s = json.dumps({
                "time spent": str(time_2 - time_1).rsplit(".")[0],
                "loss": str(ave_train_loss.get_avg()),
                "eval loss": str(eval_loss),
                "eval accuracy": str(eval_acc)
            })
            with open(output_dir + "/" + json_file_name + '.json', 'a+') as outfile:
                outfile.write(_s + '\n')
                outfile.flush()

    # At any point you can hit Ctrl + C to break out of training early.
    except KeyboardInterrupt:
        print('--------------------------------------------')
        print('Exiting from training early...')

    if agent.run_eval:
        if os.path.exists(output_dir + "/" + agent.experiment_tag + "_model.pt"):
            print('Evaluating on test set and saving log...')
            agent.load_pretrained_model(output_dir + "/" + agent.experiment_tag + "_model.pt",
                                        load_partial_graph=False)
            _, _ = evaluate.evaluate_deep_graph_infomax(env, agent, "test", verbose=True)
def train():
    time_1 = datetime.datetime.now()
    config = generic.load_config()
    env = ObservationGenerationData(config)
    env.split_reset("train")
    agent = Agent(config)
    agent.zero_noise()
    ave_train_loss = generic.HistoryScoreCache(capacity=500)

    # visdom
    if config["general"]["visdom"]:
        import visdom
        viz = visdom.Visdom()
        plt_win = None
        viz_loss, viz_eval_loss = [], []

    episode_no = 0
    batch_no = 0

    output_dir = "."
    data_dir = "."
    json_file_name = agent.experiment_tag.replace(" ", "_")
    best_training_loss_so_far, best_eval_loss_so_far = 10000.0, 10000.0

    # load model from checkpoint
    if agent.load_pretrained:
        if os.path.exists(output_dir + "/" + agent.experiment_tag + "_model.pt"):
            agent.load_pretrained_model(output_dir + "/" + agent.experiment_tag + "_model.pt",
                                        load_partial_graph=False)
        elif os.path.exists(data_dir + "/" + agent.load_graph_generation_model_from_tag + ".pt"):
            agent.load_pretrained_model(
                data_dir + "/" + agent.load_graph_generation_model_from_tag + ".pt",
                load_partial_graph=False)

    try:
        while True:
            if episode_no > agent.max_episode:
                break
            agent.train()
            observation_strings, prev_action_strings = env.get_batch()
            training_losses, _ = agent.get_observation_infomax_loss(
                observation_strings, prev_action_strings)
            curr_batch_size = len(observation_strings)
            for _loss in training_losses:
                ave_train_loss.push(_loss)

            # lr schedule: logarithmic warmup, then constant
            # learning_rate = 1.0 * (generic.power(agent.model.block_hidden_dim, -0.5) * min(generic.power(batch_no, -0.5), batch_no * generic.power(agent.learning_rate_warmup_until, -1.5)))
            if batch_no < agent.learning_rate_warmup_until:
                cr = agent.init_learning_rate / math.log2(agent.learning_rate_warmup_until)
                learning_rate = cr * math.log2(batch_no + 1)
            else:
                learning_rate = agent.init_learning_rate
            for param_group in agent.optimizer.param_groups:
                param_group['lr'] = learning_rate

            episode_no += curr_batch_size
            batch_no += 1

            time_2 = datetime.datetime.now()
            print("Episode: {:3d} | time spent: {:s} | loss: {:2.3f}".format(
                episode_no, str(time_2 - time_1).rsplit(".")[0],
                ave_train_loss.get_avg()))

            if agent.report_frequency == 0 or (
                    episode_no % agent.report_frequency >
                    (episode_no - curr_batch_size) % agent.report_frequency):
                continue

            eval_loss, eval_acc = 100000.0, 0
            if episode_no % agent.report_frequency <= (
                    episode_no - curr_batch_size) % agent.report_frequency:
                if agent.run_eval:
                    eval_loss, eval_acc = evaluate.evaluate_observation_infomax(
                        env, agent, "valid")
                    env.split_reset("train")
                    # when eval is enabled, save the model by best eval loss
                    if eval_loss < best_eval_loss_so_far:
                        best_eval_loss_so_far = eval_loss
                        agent.save_model_to_path(output_dir + "/" + agent.experiment_tag + "_model.pt")
                else:
                    loss = ave_train_loss.get_avg()
                    if loss < best_training_loss_so_far:
                        best_training_loss_so_far = loss
                        agent.save_model_to_path(output_dir + "/" + agent.experiment_tag + "_model.pt")

            time_2 = datetime.datetime.now()
            print("Episode: {:3d} | time spent: {:s} | loss: {:2.3f} | valid loss: {:2.3f}".format(
                episode_no, str(time_2 - time_1).rsplit(".")[0],
                ave_train_loss.get_avg(), eval_loss))

            # plot using visdom
            if config["general"]["visdom"]:
                viz_loss.append(ave_train_loss.get_avg())
                viz_eval_loss.append(eval_loss)
                viz_x = np.arange(len(viz_loss)).tolist()

                if plt_win is None:
                    plt_win = viz.line(X=viz_x, Y=viz_loss,
                                       opts=dict(title=agent.experiment_tag + "_loss"),
                                       name="training loss")
                    viz.line(X=viz_x, Y=viz_eval_loss,
                             opts=dict(title=agent.experiment_tag + "_eval_loss"),
                             win=plt_win, update='append', name="eval loss")
                else:
                    viz.line(X=[len(viz_loss) - 1], Y=[viz_loss[-1]],
                             opts=dict(title=agent.experiment_tag + "_loss"),
                             win=plt_win, update='append', name="training loss")
                    viz.line(X=[len(viz_eval_loss) - 1], Y=[viz_eval_loss[-1]],
                             opts=dict(title=agent.experiment_tag + "_eval_loss"),
                             win=plt_win, update='append', name="eval loss")

            # write accuracies down into file
            _s = json.dumps({
                "time spent": str(time_2 - time_1).rsplit(".")[0],
                "loss": str(ave_train_loss.get_avg()),
                "eval loss": str(eval_loss),
                "eval accuracy": str(eval_acc)
            })
            with open(output_dir + "/" + json_file_name + '.json', 'a+') as outfile:
                outfile.write(_s + '\n')
                outfile.flush()

    # At any point you can hit Ctrl + C to break out of training early.
    except KeyboardInterrupt:
        print('--------------------------------------------')
        print('Exiting from training early...')

    if agent.run_eval:
        if os.path.exists(output_dir + "/" + agent.experiment_tag + "_model.pt"):
            print('Evaluating on test set and saving log...')
            agent.load_pretrained_model(output_dir + "/" + agent.experiment_tag + "_model.pt",
                                        load_partial_graph=False)
            eval_loss, eval_acc = evaluate.evaluate_observation_infomax(env, agent, "test")
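
# generic.HistoryScoreCache is not defined in this section. Below is a
# plausible minimal implementation, consistent with how it is used here
# (push scalar losses, read a sliding average over the most recent
# `capacity` values); the real class in the codebase may differ.
from collections import deque

class HistoryScoreCacheSketch:
    def __init__(self, capacity=500):
        self.memory = deque(maxlen=capacity)  # drops oldest entries automatically

    def push(self, score):
        self.memory.append(float(score))

    def get_avg(self):
        if not self.memory:
            return 0.0
        return sum(self.memory) / len(self.memory)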
def train():
    time_1 = datetime.datetime.now()
    config = generic.load_config()
    agent = Agent(config)
    output_dir = "."
    data_dir = "."

    # make game environments
    requested_infos = agent.select_additional_infos_lite()
    requested_infos_eval = agent.select_additional_infos()
    games_dir = "./"

    # training game env
    env, _ = reinforcement_learning_dataset.get_training_game_env(
        games_dir + config['rl']['data_path'],
        config['rl']['difficulty_level'],
        config['rl']['training_size'],
        requested_infos,
        agent.max_nb_steps_per_episode,
        agent.batch_size)

    if agent.run_eval:
        # evaluation game env
        eval_env, num_eval_game = reinforcement_learning_dataset.get_evaluation_game_env(
            games_dir + config['rl']['data_path'],
            config['rl']['difficulty_level'],
            requested_infos_eval,
            agent.eval_max_nb_steps_per_episode,
            agent.eval_batch_size,
            valid_or_test="valid")
    else:
        eval_env, num_eval_game = None, None

    # visdom
    if config["general"]["visdom"]:
        import visdom
        viz = visdom.Visdom()
        reward_win, step_win = None, None
        dqn_loss_win = None
        eval_game_points_win, eval_step_win = None, None
        viz_game_rewards, viz_game_points, viz_game_points_normalized, viz_graph_rewards, viz_count_rewards, viz_step = [], [], [], [], [], []
        viz_dqn_loss = []
        viz_eval_game_points, viz_eval_game_points_normalized, viz_eval_step = [], [], []

    step_in_total = 0
    episode_no = 0
    running_avg_game_points = HistoryScoreCache(capacity=500)
    running_avg_game_points_normalized = HistoryScoreCache(capacity=500)
    running_avg_graph_rewards = HistoryScoreCache(capacity=500)
    running_avg_count_rewards = HistoryScoreCache(capacity=500)
    running_avg_game_steps = HistoryScoreCache(capacity=500)
    running_avg_dqn_loss = HistoryScoreCache(capacity=500)
    running_avg_game_rewards = HistoryScoreCache(capacity=500)

    json_file_name = agent.experiment_tag.replace(" ", "_")
    best_train_performance_so_far, best_eval_performance_so_far = 0.0, 0.0
    prev_performance = 0.0

    if os.path.exists(data_dir + "/" + agent.load_graph_generation_model_from_tag + ".pt"):
        agent.load_pretrained_graph_generation_model(
            data_dir + "/" + agent.load_graph_generation_model_from_tag + ".pt")
    else:
        print("No graph updater module detected... Please check ",
              data_dir + "/" + agent.load_graph_generation_model_from_tag + ".pt")

    # load model from checkpoint
    if agent.load_pretrained:
        if os.path.exists(output_dir + "/" + agent.experiment_tag + "_model.pt"):
            agent.load_pretrained_model(output_dir + "/" + agent.experiment_tag + "_model.pt",
                                        load_partial_graph=False)
            agent.update_target_net()
        elif os.path.exists(data_dir + "/" + agent.load_from_tag + ".pt"):
            agent.load_pretrained_model(data_dir + "/" + agent.load_from_tag + ".pt")
            agent.update_target_net()

    i_have_seen_these_states = EpisodicCountingMemory()  # episodic counting based memory
    i_am_patient = 0
    perfect_training = 0

    while True:
        if episode_no > agent.max_episode:
            break
        np.random.seed(episode_no)
        env.seed(episode_no)
        obs, infos = env.reset()
        # filter look and examine actions
        for commands_ in infos["admissible_commands"]:
            for cmd_ in [cmd for cmd in commands_
                         if cmd != "examine cookbook" and cmd.split()[0] in ["examine", "look"]]:
                commands_.remove(cmd_)
        batch_size = len(obs)

        agent.train()
        agent.init()

        game_name_list = [game.metadata["uuid"].split("-")[-1] for game in infos["game"]]
        game_max_score_list = [game.max_score for game in infos["game"]]
        i_have_seen_these_states.reset()  # reset episodic counting based memory
        prev_triplets, chosen_actions = [], []
        prev_step_dones, prev_rewards = [], []
        for _ in range(batch_size):
            prev_triplets.append([])
            chosen_actions.append("restart")
            prev_step_dones.append(0.0)
            prev_rewards.append(0.0)

        prev_h, prev_c = None, None

        observation_strings, action_candidate_list = agent.get_game_info_at_certain_step_lite(obs, infos)
        observation_for_counting = copy.copy(observation_strings)
        observation_strings = [item + " <sep> " + a for item, a in zip(observation_strings, chosen_actions)]

        # generate g_belief begins
        generated_commands = agent.command_generation_greedy_generation(observation_strings, prev_triplets)
        current_triplets = agent.update_knowledge_graph_triplets(prev_triplets, generated_commands)
        # generate g_belief ends
        i_have_seen_these_states.push(current_triplets)  # push initial triplets into memory

        if agent.count_reward_lambda > 0:
            agent.reset_binarized_counter(batch_size)
            _ = agent.get_binarized_count(observation_for_counting)

        # Transitions must be stored in the replay memory in order, so we
        # cache what the agent returns and push everything into memory
        # together at the end of the game.
        transition_cache = []
        still_running_mask = []
        game_rewards, game_points, graph_rewards, count_rewards = [], [], [], []
        print_actions = []

        act_randomly = False if agent.noisy_net else episode_no < agent.learn_start_from_this_episode

        for step_no in range(agent.max_nb_steps_per_episode):
            if agent.noisy_net:
                agent.reset_noise()  # draw a new set of noisy weights

            new_chosen_actions, chosen_indices, prev_h, prev_c = agent.act(
                observation_strings, current_triplets, action_candidate_list,
                previous_h=prev_h, previous_c=prev_c, random=act_randomly)
            replay_info = [observation_strings, action_candidate_list,
                           chosen_indices, current_triplets, chosen_actions]
            transition_cache.append(replay_info)
            chosen_actions = new_chosen_actions
            chosen_actions_before_parsing = [
                item[idx] for item, idx in zip(infos["admissible_commands"], chosen_indices)]

            obs, scores, dones, infos = env.step(chosen_actions_before_parsing)
            # filter look and examine actions
            for commands_ in infos["admissible_commands"]:
                for cmd_ in [cmd for cmd in commands_
                             if cmd != "examine cookbook" and cmd.split()[0] in ["examine", "look"]]:
                    commands_.remove(cmd_)

            prev_triplets = current_triplets
            observation_strings, action_candidate_list = agent.get_game_info_at_certain_step_lite(obs, infos)
            observation_for_counting = copy.copy(observation_strings)
            observation_strings = [item + " <sep> " + a for item, a in zip(observation_strings, chosen_actions)]

            # generate g_belief begins
            generated_commands = agent.command_generation_greedy_generation(observation_strings, prev_triplets)
            current_triplets = agent.update_knowledge_graph_triplets(prev_triplets, generated_commands)
            # generate g_belief ends
            has_not_seen = i_have_seen_these_states.has_not_seen(current_triplets)
            i_have_seen_these_states.push(current_triplets)  # push the new triplets into memory

            if agent.noisy_net and step_in_total % agent.update_per_k_game_steps == 0:
                agent.reset_noise()  # draw a new set of noisy weights

            if episode_no >= agent.learn_start_from_this_episode and step_in_total % agent.update_per_k_game_steps == 0:
                dqn_loss, _ = agent.update_dqn(episode_no)
                if dqn_loss is not None:
                    running_avg_dqn_loss.push(dqn_loss)

            if step_no == agent.max_nb_steps_per_episode - 1:
                # terminate the game because DQN requires one extra step
                dones = [True for _ in dones]

            step_in_total += 1
            still_running = [1.0 - float(item) for item in prev_step_dones]  # list of float
            prev_step_dones = dones
            step_rewards = [float(curr) - float(prev) for curr, prev in zip(scores, prev_rewards)]  # list of float
            game_points.append(copy.copy(step_rewards))
            if agent.use_negative_reward:
                step_rewards = [-1.0 if _lost else r for r, _lost in zip(step_rewards, infos["has_lost"])]  # list of float
                step_rewards = [5.0 if _won else r for r, _won in zip(step_rewards, infos["has_won"])]  # list of float
            prev_rewards = scores

            if agent.fully_observable_graph:
                step_graph_rewards = [0.0 for _ in range(batch_size)]
            else:
                step_graph_rewards = agent.get_graph_rewards(prev_triplets, current_triplets)  # list of float
                step_graph_rewards = [r * float(m) for r, m in zip(step_graph_rewards, has_not_seen)]

            # counting bonus
            if agent.count_reward_lambda > 0:
                step_revisit_counting_rewards = agent.get_binarized_count(observation_for_counting, update=True)
                step_revisit_counting_rewards = [r * agent.count_reward_lambda for r in step_revisit_counting_rewards]
            else:
                step_revisit_counting_rewards = [0.0 for _ in range(batch_size)]

            still_running_mask.append(still_running)
            game_rewards.append(step_rewards)
            graph_rewards.append(step_graph_rewards)
            count_rewards.append(step_revisit_counting_rewards)
            print_actions.append(chosen_actions_before_parsing[0] if still_running[0] else "--")

            # if all ended, break
            if np.sum(still_running) == 0:
                break

        still_running_mask_np = np.array(still_running_mask)
        game_rewards_np = np.array(game_rewards) * still_running_mask_np  # step x batch
        game_points_np = np.array(game_points) * still_running_mask_np  # step x batch
        graph_rewards_np = np.array(graph_rewards) * still_running_mask_np  # step x batch
        count_rewards_np = np.array(count_rewards) * still_running_mask_np  # step x batch

        if agent.graph_reward_lambda > 0.0:
            graph_rewards_pt = generic.to_pt(graph_rewards_np, enable_cuda=agent.use_cuda, type='float')  # step x batch
        else:
            graph_rewards_pt = generic.to_pt(np.zeros_like(graph_rewards_np), enable_cuda=agent.use_cuda, type='float')  # step x batch
        if agent.count_reward_lambda > 0.0:
            count_rewards_pt = generic.to_pt(count_rewards_np, enable_cuda=agent.use_cuda, type='float')  # step x batch
        else:
            count_rewards_pt = generic.to_pt(np.zeros_like(count_rewards_np), enable_cuda=agent.use_cuda, type='float')  # step x batch
        command_rewards_pt = generic.to_pt(game_rewards_np, enable_cuda=agent.use_cuda, type='float')  # step x batch

        # push experience into replay buffer (dqn)
        avg_rewards_in_buffer = agent.dqn_memory.avg_rewards()
        for b in range(game_rewards_np.shape[1]):
            if still_running_mask_np.shape[0] == agent.max_nb_steps_per_episode and still_running_mask_np[-1][b] != 0:
                # need to pad one transition
                _need_pad = True
                tmp_game_rewards = game_rewards_np[:, b].tolist() + [0.0]
            else:
                _need_pad = False
                tmp_game_rewards = game_rewards_np[:, b]
            if np.mean(tmp_game_rewards) < avg_rewards_in_buffer * agent.buffer_reward_threshold:
                continue
            for i in range(game_rewards_np.shape[0]):
                observation_strings, action_candidate_list, chosen_indices, _triplets, prev_action_strings = transition_cache[i]
                is_final = True
                if still_running_mask_np[i][b] != 0:
                    is_final = False
                agent.dqn_memory.add(observation_strings[b], prev_action_strings[b],
                                     action_candidate_list[b], chosen_indices[b], _triplets[b],
                                     command_rewards_pt[i][b], graph_rewards_pt[i][b],
                                     count_rewards_pt[i][b], is_final)
                if still_running_mask_np[i][b] == 0:
                    break
            if _need_pad:
                observation_strings, action_candidate_list, chosen_indices, _triplets, prev_action_strings = transition_cache[-1]
                agent.dqn_memory.add(observation_strings[b], prev_action_strings[b],
                                     action_candidate_list[b], chosen_indices[b], _triplets[b],
                                     command_rewards_pt[-1][b] * 0.0, graph_rewards_pt[-1][b] * 0.0,
                                     count_rewards_pt[-1][b] * 0.0, True)

        for b in range(batch_size):
            running_avg_game_points.push(np.sum(game_points_np, 0)[b])
            game_max_score_np = np.array(game_max_score_list, dtype="float32")
            running_avg_game_points_normalized.push((np.sum(game_points_np, 0) / game_max_score_np)[b])
            running_avg_game_steps.push(np.sum(still_running_mask_np, 0)[b])
            running_avg_game_rewards.push(np.sum(game_rewards_np, 0)[b])
            running_avg_graph_rewards.push(np.sum(graph_rewards_np, 0)[b])
            running_avg_count_rewards.push(np.sum(count_rewards_np, 0)[b])

        # finish game
        agent.finish_of_episode(episode_no, batch_size)
        episode_no += batch_size

        if episode_no < agent.learn_start_from_this_episode:
            continue
        if agent.report_frequency == 0 or (
                episode_no % agent.report_frequency >
                (episode_no - batch_size) % agent.report_frequency):
            continue

        time_2 = datetime.datetime.now()
        print("Episode: {:3d} | time spent: {:s} | dqn loss: {:2.3f} | game points: {:2.3f} | normalized game points: {:2.3f} | game rewards: {:2.3f} | graph rewards: {:2.3f} | count rewards: {:2.3f} | used steps: {:2.3f}".format(
            episode_no, str(time_2 - time_1).rsplit(".")[0],
            running_avg_dqn_loss.get_avg(),
            running_avg_game_points.get_avg(),
            running_avg_game_points_normalized.get_avg(),
            running_avg_game_rewards.get_avg(),
            running_avg_graph_rewards.get_avg(),
            running_avg_count_rewards.get_avg(),
            running_avg_game_steps.get_avg()))
        print(game_name_list[0] + ": " + " | ".join(print_actions))

        # evaluate
        curr_train_performance = running_avg_game_points_normalized.get_avg()
        eval_game_points, eval_game_points_normalized, eval_game_step = 0.0, 0.0, 0.0
        eval_command_generation_f1 = 0.0
        if agent.run_eval:
            eval_game_points, eval_game_points_normalized, eval_game_step, eval_command_generation_f1, detailed_scores = evaluate.evaluate_belief_mode(
                eval_env, agent, num_eval_game)
            curr_eval_performance = eval_game_points_normalized
            curr_performance = curr_eval_performance
            if curr_eval_performance > best_eval_performance_so_far:
                best_eval_performance_so_far = curr_eval_performance
                agent.save_model_to_path(output_dir + "/" + agent.experiment_tag + "_model.pt")
            elif curr_eval_performance == best_eval_performance_so_far:
                if curr_eval_performance > 0.0:
                    agent.save_model_to_path(output_dir + "/" + agent.experiment_tag + "_model.pt")
                else:
                    if curr_train_performance >= best_train_performance_so_far:
                        agent.save_model_to_path(output_dir + "/" + agent.experiment_tag + "_model.pt")
        else:
            curr_eval_performance = 0.0
            detailed_scores = ""
            curr_performance = curr_train_performance
            if curr_train_performance >= best_train_performance_so_far:
                agent.save_model_to_path(output_dir + "/" + agent.experiment_tag + "_model.pt")

        # update best train performance
        if curr_train_performance >= best_train_performance_so_far:
            best_train_performance_so_far = curr_train_performance

        if prev_performance <= curr_performance:
            i_am_patient = 0
        else:
            i_am_patient += 1
        prev_performance = curr_performance

        # if patience is exhausted, resume from the best checkpoint
        if agent.patience > 0 and i_am_patient >= agent.patience:
            if os.path.exists(output_dir + "/" + agent.experiment_tag + "_model.pt"):
                print('reload from a good checkpoint...')
                agent.load_pretrained_model(output_dir + "/" + agent.experiment_tag + "_model.pt",
                                            load_partial_graph=False)
                agent.update_target_net()
            i_am_patient = 0

        if running_avg_game_points_normalized.get_avg() >= 0.95:
            perfect_training += 1
        else:
            perfect_training = 0

        # plot using visdom
        if config["general"]["visdom"]:
            viz_game_rewards.append(running_avg_game_rewards.get_avg())
            viz_game_points.append(running_avg_game_points.get_avg())
            viz_game_points_normalized.append(running_avg_game_points_normalized.get_avg())
            viz_graph_rewards.append(running_avg_graph_rewards.get_avg())
            viz_count_rewards.append(running_avg_count_rewards.get_avg())
            viz_step.append(running_avg_game_steps.get_avg())
            viz_dqn_loss.append(running_avg_dqn_loss.get_avg())
            viz_eval_game_points.append(eval_game_points)
            viz_eval_game_points_normalized.append(eval_game_points_normalized)
            viz_eval_step.append(eval_game_step)
            viz_x = np.arange(len(viz_game_rewards)).tolist()

            if reward_win is None:
                reward_win = viz.line(X=viz_x, Y=viz_game_rewards,
                                      opts=dict(title=agent.experiment_tag + "_game_rewards"),
                                      name="game_rewards")
                viz.line(X=viz_x, Y=viz_graph_rewards,
                         opts=dict(title=agent.experiment_tag + "_graph_rewards"),
                         win=reward_win, update='append', name="graph_rewards")
                viz.line(X=viz_x, Y=viz_count_rewards,
                         opts=dict(title=agent.experiment_tag + "_count_rewards"),
                         win=reward_win, update='append', name="count_rewards")
                viz.line(X=viz_x, Y=viz_game_points,
                         opts=dict(title=agent.experiment_tag + "_game_points"),
                         win=reward_win, update='append', name="game_points")
                viz.line(X=viz_x, Y=viz_game_points_normalized,
                         opts=dict(title=agent.experiment_tag + "_game_points_normalized"),
                         win=reward_win, update='append', name="game_points_normalized")
            else:
                viz.line(X=[len(viz_game_rewards) - 1], Y=[viz_game_rewards[-1]],
                         opts=dict(title=agent.experiment_tag + "_game_rewards"),
                         win=reward_win, update='append', name="game_rewards")
                viz.line(X=[len(viz_graph_rewards) - 1], Y=[viz_graph_rewards[-1]],
                         opts=dict(title=agent.experiment_tag + "_graph_rewards"),
                         win=reward_win, update='append', name="graph_rewards")
                viz.line(X=[len(viz_count_rewards) - 1], Y=[viz_count_rewards[-1]],
                         opts=dict(title=agent.experiment_tag + "_count_rewards"),
                         win=reward_win, update='append', name="count_rewards")
                viz.line(X=[len(viz_game_points) - 1], Y=[viz_game_points[-1]],
                         opts=dict(title=agent.experiment_tag + "_game_points"),
                         win=reward_win, update='append', name="game_points")
                viz.line(X=[len(viz_game_points_normalized) - 1], Y=[viz_game_points_normalized[-1]],
                         opts=dict(title=agent.experiment_tag + "_game_points_normalized"),
                         win=reward_win, update='append', name="game_points_normalized")

            if step_win is None:
                step_win = viz.line(X=viz_x, Y=viz_step,
                                    opts=dict(title=agent.experiment_tag + "_step"),
                                    name="step")
            else:
                viz.line(X=[len(viz_step) - 1], Y=[viz_step[-1]],
                         opts=dict(title=agent.experiment_tag + "_step"),
                         win=step_win, update='append', name="step")

            if dqn_loss_win is None:
                dqn_loss_win = viz.line(X=viz_x, Y=viz_dqn_loss,
                                        opts=dict(title=agent.experiment_tag + "_dqn_loss"),
                                        name="dqn loss")
            else:
                viz.line(X=[len(viz_dqn_loss) - 1], Y=[viz_dqn_loss[-1]],
                         opts=dict(title=agent.experiment_tag + "_dqn_loss"),
                         win=dqn_loss_win, update='append', name="dqn loss")

            if eval_game_points_win is None:
                eval_game_points_win = viz.line(X=viz_x, Y=viz_eval_game_points,
                                                opts=dict(title=agent.experiment_tag + "_eval_game_points"),
                                                name="eval game points")
                viz.line(X=viz_x, Y=viz_eval_game_points_normalized,
                         opts=dict(title=agent.experiment_tag + "_eval_game_points_normalized"),
                         win=eval_game_points_win, update='append', name="eval_game_points_normalized")
            else:
                viz.line(X=[len(viz_eval_game_points) - 1], Y=[viz_eval_game_points[-1]],
                         opts=dict(title=agent.experiment_tag + "_eval_game_points"),
                         win=eval_game_points_win, update='append', name="eval game points")
                viz.line(X=[len(viz_eval_game_points_normalized) - 1], Y=[viz_eval_game_points_normalized[-1]],
                         opts=dict(title=agent.experiment_tag + "_eval_game_points_normalized"),
                         win=eval_game_points_win, update='append', name="eval_game_points_normalized")

            if eval_step_win is None:
                eval_step_win = viz.line(X=viz_x, Y=viz_eval_step,
                                         opts=dict(title=agent.experiment_tag + "_eval_step"),
                                         name="eval step")
            else:
                viz.line(X=[len(viz_eval_step) - 1], Y=[viz_eval_step[-1]],
                         opts=dict(title=agent.experiment_tag + "_eval_step"),
                         win=eval_step_win, update='append', name="eval step")

        # write accuracies down into file
        _s = json.dumps({
            "time spent": str(time_2 - time_1).rsplit(".")[0],
            "dqn loss": str(running_avg_dqn_loss.get_avg()),
            "train game points": str(running_avg_game_points.get_avg()),
            "train normalized game points": str(running_avg_game_points_normalized.get_avg()),
            "train game rewards": str(running_avg_game_rewards.get_avg()),
            "train graph rewards": str(running_avg_graph_rewards.get_avg()),
            "train count rewards": str(running_avg_count_rewards.get_avg()),
            "train steps": str(running_avg_game_steps.get_avg()),
            "eval game points": str(eval_game_points),
            "eval normalized game points": str(eval_game_points_normalized),
            "eval command generation f1": str(eval_command_generation_f1),
            "eval steps": str(eval_game_step),
            "detailed scores": detailed_scores
        })
        with open(output_dir + "/" + json_file_name + '.json', 'a+') as outfile:
            outfile.write(_s + '\n')
            outfile.flush()

        if curr_performance == 1.0 and curr_train_performance >= 0.95:
            break
        if perfect_training >= 3:
            break
def train():
    time_1 = datetime.datetime.now()
    config = generic.load_config()
    env = ObservationGenerationData(config)
    env.split_reset("train")
    agent = Agent(config)
    agent.zero_noise()
    ave_train_loss = generic.HistoryScoreCache(capacity=500)

    # visdom
    if config["general"]["visdom"]:
        import visdom
        viz = visdom.Visdom()
        plt_win = None
        eval_plt_win = None
        viz_loss, viz_eval_loss, viz_eval_f1 = [], [], []

    episode_no = 0
    batch_no = 0

    output_dir = "."
    data_dir = "."
    json_file_name = agent.experiment_tag.replace(" ", "_")
    best_eval_loss_so_far, best_training_loss_so_far = 10000.0, 10000.0

    # load model from checkpoint
    if agent.load_pretrained:
        if os.path.exists(output_dir + "/" + agent.experiment_tag + "_model.pt"):
            agent.load_pretrained_model(output_dir + "/" + agent.experiment_tag + "_model.pt",
                                        load_partial_graph=False)
        elif os.path.exists(data_dir + "/" + agent.load_graph_generation_model_from_tag + ".pt"):
            agent.load_pretrained_model(data_dir + "/" + agent.load_graph_generation_model_from_tag + ".pt",
                                        load_partial_graph=False)

    try:
        while True:
            if episode_no > agent.max_episode:
                break
            agent.train()
            observation_strings, prev_action_strings = env.get_batch()
            curr_batch_size = len(observation_strings)
            lens = [len(elem) for elem in observation_strings]
            max_len = max(lens)
            padded_observation_strings = [elem + ["<pad>"] * (max_len - len(elem)) for elem in observation_strings]
            padded_prev_action_strings = [elem + ["<pad>"] * (max_len - len(elem)) for elem in prev_action_strings]
            masks = torch.zeros((curr_batch_size, max_len), dtype=torch.float).cuda() if agent.use_cuda \
                else torch.zeros((curr_batch_size, max_len), dtype=torch.float)
            for i in range(curr_batch_size):
                masks[i, :lens[i]] = 1

            preds_last_batch = []
            last_k_batches_loss = []
            prev_h = None
            for i in range(max_len):
                batch_obs_string = [elem[i] for elem in padded_observation_strings]
                batch_prev_action_string = [elem[i] for elem in padded_prev_action_strings]
                loss, pred, prev_h = agent.observation_generation_teacher_force(
                    batch_obs_string, batch_prev_action_string, masks[:, i], prev_h)
                last_k_batches_loss.append(loss)
                ave_train_loss.push(generic.to_np(loss))
                preds_last_batch.append(pred[-1])
                if (i + 1) % agent.backprop_frequency == 0 or i == max_len - 1:  # and i > 0:
                    agent.optimizer.zero_grad()
                    ave_k_loss = torch.mean(torch.stack(last_k_batches_loss))
                    ave_k_loss.backward()
                    agent.optimizer.step()
                    last_k_batches_loss = []
                    prev_h = prev_h.detach()

            # decode a few steps of the last batch element for inspection
            k = 0
            ep_string = []
            while masks[-1][k] > 0:
                step_string = []
                regen_strings = preds_last_batch[k].argmax(-1)
                for l in range(len(regen_strings)):
                    step_string.append(agent.word_vocab[regen_strings[l]])
                ep_string.append(' '.join(step_string).split("<eos>")[0])
                k += 1
                if k == len(masks[-1]):
                    break
            if len(ep_string) >= 3:
                print(' | '.join(ep_string[:3]))

            # lr schedule: logarithmic warmup, then constant
            # learning_rate = 1.0 * (generic.power(agent.model.block_hidden_dim, -0.5) * min(generic.power(batch_no, -0.5), batch_no * generic.power(agent.learning_rate_warmup_until, -1.5)))
            if batch_no < agent.learning_rate_warmup_until:
                cr = agent.init_learning_rate / math.log2(agent.learning_rate_warmup_until)
                learning_rate = cr * math.log2(batch_no + 1)
            else:
                learning_rate = agent.init_learning_rate
            for param_group in agent.optimizer.param_groups:
                param_group['lr'] = learning_rate

            episode_no += curr_batch_size
            batch_no += 1

            time_2 = datetime.datetime.now()
            print("Episode: {:3d} | time spent: {:s} | loss: {:2.3f}".format(
                episode_no, str(time_2 - time_1).rsplit(".")[0],
                ave_train_loss.get_avg()))

            if agent.report_frequency == 0 or (
                    episode_no % agent.report_frequency >
                    (episode_no - curr_batch_size) % agent.report_frequency):
                continue

            eval_loss, eval_f1 = 0.0, 0.0
            if episode_no % agent.report_frequency <= (
                    episode_no - curr_batch_size) % agent.report_frequency:
                if agent.run_eval:
                    eval_loss = evaluate.evaluate_observation_generation_loss(env, agent, "valid")
                    eval_f1 = evaluate.evaluate_observation_generation_free_generation(env, agent, "valid")
                    env.split_reset("train")
                    # when eval is enabled, save the model by best eval loss
                    if eval_loss < best_eval_loss_so_far:
                        best_eval_loss_so_far = eval_loss
                        agent.save_model_to_path(output_dir + "/" + agent.experiment_tag + "_model.pt")
                else:
                    if loss < best_training_loss_so_far:
                        best_training_loss_so_far = loss
                        agent.save_model_to_path(output_dir + "/" + agent.experiment_tag + "_model.pt")

            time_2 = datetime.datetime.now()
            print("Episode: {:3d} | time spent: {:s} | loss: {:2.3f} | valid loss: {:2.3f} | valid f1: {:2.3f}".format(
                episode_no, str(time_2 - time_1).rsplit(".")[0],
                loss, eval_loss, eval_f1))

            # plot using visdom
            if config["general"]["visdom"]:
                viz_loss.append(ave_train_loss.get_avg())
                viz_eval_loss.append(eval_loss)
                viz_eval_f1.append(eval_f1)
                viz_x = np.arange(len(viz_loss)).tolist()

                if plt_win is None:
                    plt_win = viz.line(X=viz_x, Y=viz_loss,
                                       opts=dict(title=agent.experiment_tag + "_loss"),
                                       name="training loss")
                    viz.line(X=viz_x, Y=viz_eval_loss,
                             opts=dict(title=agent.experiment_tag + "_eval_loss"),
                             win=plt_win, update='append', name="eval loss")
                else:
                    viz.line(X=[len(viz_loss) - 1], Y=[viz_loss[-1]],
                             opts=dict(title=agent.experiment_tag + "_loss"),
                             win=plt_win, update='append', name="training loss")
                    viz.line(X=[len(viz_eval_loss) - 1], Y=[viz_eval_loss[-1]],
                             opts=dict(title=agent.experiment_tag + "_eval_loss"),
                             win=plt_win, update='append', name="eval loss")

                if eval_plt_win is None:
                    eval_plt_win = viz.line(X=viz_x, Y=viz_eval_f1,
                                            opts=dict(title=agent.experiment_tag + "_eval_f1"),
                                            name="eval f1")
                else:
                    viz.line(X=[len(viz_eval_f1) - 1], Y=[viz_eval_f1[-1]],
                             opts=dict(title=agent.experiment_tag + "_eval_f1"),
                             win=eval_plt_win, update='append', name="eval f1")

            # write accuracies down into file
            _s = json.dumps({
                "time spent": str(time_2 - time_1).rsplit(".")[0],
                "loss": str(ave_train_loss.get_avg()),
                "eval loss": str(eval_loss),
                "eval f1": str(eval_f1)
            })
            with open(output_dir + "/" + json_file_name + '.json', 'a+') as outfile:
                outfile.write(_s + '\n')
                outfile.flush()

    # At any point you can hit Ctrl + C to break out of training early.
    except KeyboardInterrupt:
        print('--------------------------------------------')
        print('Exiting from training early...')

    if agent.run_eval:
        if os.path.exists(output_dir + "/" + agent.experiment_tag + "_model.pt"):
            print('Evaluating on test set and saving log...')
            agent.load_pretrained_model(output_dir + "/" + agent.experiment_tag + "_model.pt",
                                        load_partial_graph=False)
            test_loss = evaluate.evaluate_observation_generation_loss(env, agent, "test")
            test_f1 = evaluate.evaluate_observation_generation_free_generation(env, agent, "test")
            print(test_loss, test_f1)
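
# A standalone sketch of the padding/masking scheme used in train() above:
# episodes of different lengths are right-padded with "<pad>" to the batch
# maximum, and a float mask marks the real steps so padded positions can be
# zeroed out of the loss. The helper name is illustrative, not from the
# codebase.
import torch

def pad_and_mask(episodes, pad_token="<pad>"):
    lens = [len(ep) for ep in episodes]
    max_len = max(lens)
    padded = [ep + [pad_token] * (max_len - len(ep)) for ep in episodes]
    masks = torch.zeros((len(episodes), max_len), dtype=torch.float)
    for i, n in enumerate(lens):
        masks[i, :n] = 1.0
    return padded, masks

# pad_and_mask([["go north", "take key"], ["look"]])
# -> padded [["go north", "take key"], ["look", "<pad>"]],
#    masks  tensor([[1., 1.], [1., 0.]])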