def debug_manual_control(self, data_point, vocab):
    self.server.clear_metadata()
    task_completion_accuracy = 0

    image, metadata = self.server.reset_receive_feedback(data_point)
    state = AgentObservedState(instruction=data_point.instruction,
                               config=self.config,
                               constants=self.constants,
                               start_image=image,
                               previous_action=None,
                               data_point=data_point)
    num_actions = 0
    print("Instruction is ",
          " ".join([vocab[index] for index in data_point.instruction]))
    plt.ion()

    while True:
        # Show the goal location
        self.show_goal_location(image, metadata)

        incorrect_action = True
        action_string = None
        while incorrect_action:
            action_string = input(
                "Take the action. 0: Forward, 1: Left, 2: Right, 3: Stop, 4: Interact\n")
            if action_string in ['0', '1', '2', '3', '4']:
                incorrect_action = False
        if action_string == '4':
            interact_values = input("Enter the row and column in format: row col")
            row, col = interact_values.split()
            row, col = int(row), int(col)
            action_string = 4 + row * 32 + col

        action = int(action_string)
        action_name = self.action_space.get_action_name(action)

        if action == self.action_space.get_stop_action_index():
            # Send the action and get feedback
            image, reward, metadata = self.server.halt_and_receive_feedback()
            print("Metadata is ", metadata)
            if metadata["navigation-error"] <= 1.0:
                task_completion_accuracy += 1
            break
        else:
            # Send the action and get feedback
            image, reward, metadata = self.server.send_action_receive_feedback(action)

            # Update the agent state
            state = state.update(image, action, data_point=data_point)
            num_actions += 1
            print("Metadata is ", metadata)
            print("Took action %r, Got reward %r" % (action_name, reward))
def test_human_performance(self, dataset, vocab, logger):
    self.server.clear_metadata()

    for data_point in dataset:
        task_completion_accuracy = 0
        image, metadata = self.server.reset_receive_feedback(data_point)
        state = AgentObservedState(instruction=data_point.instruction,
                                   config=self.config,
                                   constants=self.constants,
                                   start_image=image,
                                   previous_action=None,
                                   data_point=data_point)
        num_actions = 0
        print("Instruction is ",
              " ".join([vocab[index] for index in data_point.instruction]))

        while True:
            incorrect_action = True
            action_string = None
            while incorrect_action:
                action_string = input(
                    "Take the action. 0: Forward, 1: Left, 2: Right, 3: Stop, 4: Interact\n")
                if action_string in ['0', '1', '2', '3', '4']:
                    incorrect_action = False
            if action_string == '4':
                interact_values = input("Enter the row and column in format: row col")
                row, col = interact_values.split()
                row, col = int(row), int(col)
                action_string = 4 + row * 32 + col

            action = int(action_string)

            if action == self.action_space.get_stop_action_index():
                # Send the action and get feedback
                image, reward, metadata = self.server.halt_and_receive_feedback()
                if metadata["navigation-error"] <= 1.0:
                    task_completion_accuracy += 1
                    logger.log("Completed the task")
                logger.log("Meta data is %r " % metadata)
                break
            else:
                # Send the action and get feedback
                image, reward, metadata = self.server.send_action_receive_feedback(action)

                # Update the agent state
                state = state.update(image, action, data_point=data_point)
                num_actions += 1
def test_baseline(self, test_dataset):
    self.server.clear_metadata()
    metadata = {"feedback": ""}
    num_actions_list = []
    task_completion_accuracy = 0

    for data_point in test_dataset:
        image, metadata = self.server.reset_receive_feedback(data_point)
        state = AgentObservedState(instruction=data_point.instruction,
                                   config=self.config,
                                   constants=self.constants,
                                   start_image=image,
                                   previous_action=None)
        num_actions = 0
        # max_num_actions = len(data_point.get_trajectory())
        # max_num_actions += self.constants["max_extra_horizon"]
        num_segments = len(data_point.get_instruction_oracle_segmented())
        max_num_actions = self.constants["horizon"] * num_segments

        while True:
            action = self.get_next_action(data_point, num_actions)
            if action == self.action_space.get_stop_action_index() \
                    or num_actions >= max_num_actions:
                # Send the action and get feedback
                image, reward, metadata = self.server.halt_and_receive_feedback()
                num_actions_list.append(num_actions)
                self.meta_data_util.log_results(metadata)
                if metadata["stop_dist_error"] < 5.0:
                    task_completion_accuracy += 1
                break
            else:
                # Send the action and get feedback
                image, reward, metadata = self.server.send_action_receive_feedback(action)

                # Update the agent state
                state = state.update(image, action)
                num_actions += 1
                # self._save_agent_state(state, num_actions)

    self.meta_data_util.log_results(metadata)
    task_completion_accuracy /= float(max(len(test_dataset), 1))
    task_completion_accuracy *= 100.0
    mean_num_actions = float(np.array(num_actions_list).mean())
    logging.info("Task completion accuracy %r", task_completion_accuracy)
    logging.info("Done testing baseline %r, mean num actions is %f",
                 self.baseline_name, mean_num_actions)
def debug_human_control(self, data_point, tensorboard=None):
    image, metadata = self.server.reset_receive_feedback(data_point)
    state = AgentObservedState(instruction=data_point.instruction,
                               config=self.config,
                               constants=self.constants,
                               start_image=image,
                               previous_action=None,
                               data_point=data_point)
    num_actions = 0
    max_num_actions = self.constants["horizon"]
    actions = []

    message = ""
    for action in range(self.action_space.num_actions()):
        message = message + "%d (%s) " % (
            action, self.action_space.get_action_name(action)) + " "

    while True:
        # Use test policy to get the action
        action = input("Take action according to the message: " + str(message))
        action = int(action)
        actions.append(action)

        if action == self.action_space.get_stop_action_index() \
                or num_actions >= max_num_actions:
            # Send the action and get feedback
            image, reward, metadata = self.server.halt_and_receive_feedback()
            if tensorboard is not None:
                tensorboard.log_scalar("navigation_error",
                                       metadata["navigation_error"])
            # Update the scores based on meta_data
            self.meta_data_util.log_results(metadata)
            break
        else:
            # Send the action and get feedback
            image, reward, metadata = self.server.send_action_receive_feedback(action)

            # Update the agent state
            state = state.update(image, action, data_point=data_point)
            num_actions += 1

    return metadata, actions
def calc_log_prob(self, tune_dataset, tune_image, tensorboard):
    total_validation_log_probability = 0

    for data_point_ix, data_point in enumerate(tune_dataset):
        tune_image_example = tune_image[data_point_ix]
        image = tune_image_example[0]

        model_state = None
        state = AgentObservedState(instruction=data_point.instruction,
                                   config=self.config,
                                   constants=self.constants,
                                   start_image=image,
                                   previous_action=None,
                                   pose=None,
                                   position_orientation=None,
                                   data_point=data_point)

        trajectory = data_point.get_trajectory()
        validation_log_probability = 0

        for action_ix, action in enumerate(trajectory):
            log_probabilities, model_state, image_emb_seq = self.model.get_probs(
                state, model_state)
            validation_log_probability += float(log_probabilities.data[0][action])
            image = tune_image_example[action_ix + 1]
            state = state.update(image, action, pose=None,
                                 position_orientation=None,
                                 data_point=data_point)

        log_probabilities, model_state, image_emb_seq = self.model.get_probs(
            state, model_state)
        validation_log_probability += float(
            log_probabilities.data[0][self.action_space.get_stop_action_index()])

        mean_validation_log_probability = validation_log_probability / float(
            len(trajectory) + 1)
        tensorboard.log_scalar("Validation_Log_Prob",
                               mean_validation_log_probability)
        total_validation_log_probability += mean_validation_log_probability

    total_validation_log_probability /= float(max(len(tune_dataset), 1))
    logging.info("Mean Validation Log Prob is %r",
                 total_validation_log_probability)
def _test(self, data_point, tensorboard=None, logger=None):
    image, metadata = self.server.reset_receive_feedback(data_point)
    state = AgentObservedState(instruction=data_point.instruction,
                               config=self.config,
                               constants=self.constants,
                               start_image=image,
                               previous_action=None,
                               data_point=data_point)
    num_actions = 0
    max_num_actions = self.constants["horizon"]
    model_state = None
    actions = []
    total_reward = 0.0

    while True:
        # Generate probabilities over actions
        if isinstance(self.model, AbstractModel):
            probabilities = list(torch.exp(self.model.get_probs(state).data))
        elif isinstance(self.model, AbstractIncrementalModel):
            log_probabilities, model_state, _, _ = self.model.get_probs(
                state, model_state, volatile=True)
            probabilities = list(torch.exp(log_probabilities.data))[0]
        else:
            log_probabilities, model_state = self.model.get_probs(state, model_state)
            probabilities = list(torch.exp(log_probabilities.data))

        # Use test policy to get the action
        action = self.test_policy(probabilities)
        actions.append(action)

        if action == self.action_space.get_stop_action_index() \
                or num_actions >= max_num_actions:
            # Send the action and get feedback
            image, reward, metadata = self.server.halt_and_receive_feedback()
            if tensorboard is not None:
                tensorboard.log_scalar("navigation_error",
                                       metadata["navigation_error"])
            total_reward += reward

            # Update the scores based on meta_data
            self.log("StreetView Metadata: %r" % metadata, logger)
            self.log("Test Example: Num actions %r, Navigation Error %r, Total Reward %r "
                     % (num_actions, metadata["navigation_error"], total_reward),
                     logger)
            break
        else:
            # Send the action and get feedback
            image, reward, metadata = self.server.send_action_receive_feedback(action)
            total_reward += reward

            # Update the agent state
            state = state.update(image, action, data_point=data_point)
            num_actions += 1

    return metadata, actions
def test_auto_segmented(self, test_dataset, segmenting_type="oracle",
                        tensorboard=None, logger=None, pushover_logger=None):
    assert segmenting_type in ("auto", "oracle")
    self.server.clear_metadata()
    action_counts = [0] * self.action_space.num_actions()
    self.log("Performing testing on paragraphs with segmenting type %r"
             % segmenting_type, logger)
    metadata = {"feedback": ""}
    task_completion_accuracy = 0

    for data_point in test_dataset:
        if segmenting_type == "auto":
            segmented_instruction = data_point.get_instruction_auto_segmented()
        else:
            segmented_instruction = data_point.get_instruction_oracle_segmented()
        max_num_actions = self.constants["horizon"]
        image, metadata = self.server.reset_receive_feedback(data_point)

        for instruction_i, instruction in enumerate(segmented_instruction):
            pose = int(metadata["y_angle"] / 15.0)
            position_orientation = (metadata["x_pos"], metadata["z_pos"],
                                    metadata["y_angle"])
            state = AgentObservedState(
                instruction=instruction,
                config=self.config,
                constants=self.constants,
                start_image=image,
                previous_action=None,
                pose=pose,
                position_orientation=position_orientation,
                data_point=data_point,
                prev_instruction=data_point.get_prev_instruction(),
                next_instruction=data_point.get_next_instruction())

            # Reset the actions taken and model state
            num_actions = 0
            model_state = None

            while True:
                # Generate probabilities over actions
                if isinstance(self.model, AbstractModel):
                    probabilities = list(torch.exp(self.model.get_probs(state).data))
                elif isinstance(self.model, AbstractIncrementalModel):
                    log_probabilities, model_state, _, _ = self.model.get_probs(
                        state, model_state, volatile=True)
                    probabilities = list(torch.exp(log_probabilities.data))[0]
                else:
                    raise AssertionError("Unhandled Model type.")

                # Use test policy to get the action
                action = self.test_policy(probabilities)
                action_counts[action] += 1

                if action == self.action_space.get_stop_action_index() \
                        or num_actions >= max_num_actions:
                    # Compute the l2 distance to the intermediate goal
                    intermediate_goal = data_point.get_destination_list()[instruction_i]
                    agent_position = metadata["x_pos"], metadata["z_pos"]
                    distance = self._l2_distance(agent_position, intermediate_goal)
                    # logging.info("Agent: Position %r got Distance %r " % (instruction_i + 1, distance))
                    # self.log("Agent: Position %r got Distance %r " % (instruction_i + 1, distance), logger)
                    break
                else:
                    # Send the action and get feedback
                    image, reward, metadata = self.server.send_action_receive_feedback(action)

                    # Update the agent state
                    pose = int(metadata["y_angle"] / 15.0)
                    position_orientation = (metadata["x_pos"], metadata["z_pos"],
                                            metadata["y_angle"])
                    state = state.update(
                        image, action, pose=pose,
                        position_orientation=position_orientation,
                        data_point=data_point)
                    num_actions += 1

        image, reward, metadata = self.server.halt_and_receive_feedback()
        if tensorboard is not None:
            tensorboard.log_all_test_errors(metadata["edit_dist_error"],
                                            metadata["closest_dist_error"],
                                            metadata["stop_dist_error"])

        # Update the scores based on meta_data
        self.meta_data_util.log_results(metadata)
        if metadata["stop_dist_error"] < 5.0:
            task_completion_accuracy += 1

    logging.info("Testing data action counts %r", action_counts)
    task_completion_accuracy = (task_completion_accuracy * 100.0) / float(
        max(len(test_dataset), 1))
    self.log("Overall test results:", logger)
    self.log("Testing: Task completion accuracy is: %r" % task_completion_accuracy, logger)
    self.log("Testing: Final Metadata: %r" % metadata, logger)
    self.log("Testing: Action Distribution: %r" % action_counts, logger)
    self.log("Testing data action counts %r" % action_counts, logger)
    self.meta_data_util.log_results(metadata, logger)
    if pushover_logger is not None:
        pushover_feedback = str(metadata["feedback"]) + \
            " --- " + "task_completion_accuracy=%r" % task_completion_accuracy
        pushover_logger.log(pushover_feedback)
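# The segment-level evaluation above relies on an `_l2_distance` helper that is
# not shown in this section. A minimal sketch of one plausible implementation,
# assuming both arguments are (x, z) tuples in simulator units (the helper name
# comes from the call site; the body is an assumption):
import math

def _l2_distance(self, position_a, position_b):
    """Euclidean distance between two (x, z) positions. Hypothetical helper."""
    (x1, z1) = position_a
    (x2, z2) = position_b
    return math.sqrt((x1 - x2) ** 2 + (z1 - z2) ** 2)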
def do_train(self, train_dataset, train_images, train_goal_location,
             tune_dataset, tune_images, tune_goal_location, experiment_name):
    """ Perform training """

    dataset_size = len(train_dataset)
    tensorboard = self.tensorboard

    for epoch in range(1, self.max_epoch + 1):
        logging.info("Starting epoch %d", epoch)

        # Test on tuning data
        self.test(tune_dataset, tune_images, tune_goal_location,
                  tensorboard=tensorboard)

        for data_point_ix, data_point in enumerate(train_dataset):
            if (data_point_ix + 1) % 100 == 0:
                logging.info("Done %d out of %d", data_point_ix, dataset_size)

            train_images_example = train_images[data_point_ix]
            goal_location = train_goal_location[data_point_ix]
            image = train_images_example[0]

            model_state = None
            state = AgentObservedState(
                instruction=data_point.instruction,
                config=self.config,
                constants=self.constants,
                start_image=image,
                previous_action=None,
                pose=None,
                position_orientation=data_point.get_start_pos(),
                data_point=data_point)

            trajectory = data_point.get_trajectory()
            traj_len = len(trajectory)
            if self.only_first:
                trajectory = trajectory[0:1]
            batch_replay_items = []

            for action_ix, action in enumerate(trajectory):
                # Sample action using the policy
                # Generate probabilities over actions
                volatile = self.model.get_attention_prob(state, model_state)
                goal = goal_location[action_ix]

                # Store it in the replay memory list
                if not self.ignore_none or goal[0] is not None:
                    replay_item = ReplayMemoryItem(state, action, 0,
                                                   volatile=volatile, goal=goal)
                    batch_replay_items.append(replay_item)

                if not self.only_first:
                    # Send the action and get feedback
                    image = train_images_example[action_ix + 1]

                    # Update the agent state
                    state = state.update(image, action, pose=None,
                                         position_orientation=None,
                                         data_point=data_point)

            # Store it in the replay memory list
            if not self.only_first:
                goal = goal_location[traj_len]
                if not self.ignore_none or goal[0] is not None:
                    volatile = self.model.get_attention_prob(state, model_state)
                    replay_item = ReplayMemoryItem(
                        state, self.action_space.get_stop_action_index(), 0,
                        volatile=volatile, goal=goal)
                    batch_replay_items.append(replay_item)

            # Perform update
            if len(batch_replay_items) > 0:
                loss_val = self.do_update(batch_replay_items)
                if tensorboard is not None:
                    tensorboard.log_scalar("Loss", loss_val)
                    if self.goal_prediction_loss is not None:
                        goal_prediction_loss = float(self.goal_prediction_loss.data[0])
                        tensorboard.log_scalar("goal_prediction_loss",
                                               goal_prediction_loss)
                    if self.goal_prob is not None:
                        goal_prob = float(self.goal_prob.data[0])
                        tensorboard.log_scalar("goal_prob", goal_prob)
                    if self.object_detection_loss is not None:
                        object_detection_loss = float(self.object_detection_loss.data[0])
                        tensorboard.log_scalar("object_detection_loss",
                                               object_detection_loss)
                    if self.cross_entropy_loss is not None:
                        cross_entropy_loss = float(self.cross_entropy_loss.data[0])
                        tensorboard.log_scalar("Cross_entropy_loss", cross_entropy_loss)
                    if self.dist_loss is not None:
                        dist_loss = float(self.dist_loss.data[0])
                        tensorboard.log_scalar("Dist_loss", dist_loss)

        # Save the model
        self.model.save_model(experiment_name +
                              "/goal_prediction_supervised_epoch_" + str(epoch))
def test(self, tune_dataset, tune_image, tune_goal_location, tensorboard):
    total_validation_loss = 0
    total_validation_prob = 0
    total_validation_exact_accuracy = 0
    total_goal_distance = 0
    num_items = 0

    # Next metric measures when the goal is visible and the prediction is
    # within a 10% radius
    total_epsilon_accuracy = 0
    num_visible_items = 0

    for data_point_ix, data_point in enumerate(tune_dataset):
        tune_image_example = tune_image[data_point_ix]
        goal_location = tune_goal_location[data_point_ix]
        image = tune_image_example[0]

        model_state = None
        state = AgentObservedState(
            instruction=data_point.instruction,
            config=self.config,
            constants=self.constants,
            start_image=image,
            previous_action=None,
            pose=None,
            position_orientation=data_point.get_start_pos(),
            data_point=data_point)

        trajectory = data_point.get_trajectory()
        if self.only_first:
            trajectory = trajectory[0:1]
        traj_len = len(trajectory)
        num_items_ = 0
        sum_loss = 0
        sum_prob = 0
        sum_acc = 0
        sum_dist = 0

        for action_ix, action in enumerate(trajectory):
            state.goal = goal_location[action_ix]
            volatile = self.model.get_attention_prob(state, model_state)
            goal = goal_location[action_ix]
            row, col, _, _ = goal

            if not self.ignore_none or row is not None:
                if row is None:
                    gold_ix = self.final_height * self.final_width
                else:
                    gold_ix = row * self.final_width + col
                loss, prob, meta = GoalPrediction.get_loss_and_prob(
                    volatile, goal, self.final_height, self.final_width)
                num_items_ += 1
                sum_loss = sum_loss + float(loss.data.cpu().numpy()[0])
                sum_prob = sum_prob + float(prob.data.cpu().numpy()[0])

                inferred_ix = int(torch.max(volatile["attention_logits"],
                                            0)[1].data.cpu().numpy()[0])
                if gold_ix == inferred_ix:
                    sum_acc = sum_acc + 1.0
                if row is not None:
                    # The column offset is modulo the grid width (was final_height,
                    # which mis-scored non-square grids)
                    sum_dist = sum_dist \
                               + abs(row - int(round(inferred_ix / self.final_width))) \
                               + abs(col - int(inferred_ix % self.final_width))
                    num_visible_items += 1
                    if self.is_close_enough(inferred_ix, row, col):
                        total_epsilon_accuracy += 1

            if not self.only_first:
                image = tune_image_example[action_ix + 1]
                state = state.update(image, action, pose=None,
                                     position_orientation=None,
                                     data_point=data_point)

        if not self.only_first:
            state.goal = goal_location[traj_len]
            volatile = self.model.get_attention_prob(state, model_state)
            goal = goal_location[traj_len]
            row, col, _, _ = goal

            if not self.ignore_none or row is not None:
                if row is None:
                    gold_ix = self.final_height * self.final_width
                else:
                    gold_ix = row * self.final_width + col
                loss, prob, _ = GoalPrediction.get_loss_and_prob(
                    volatile, goal, self.final_height, self.final_width)
                num_items_ += 1
                sum_loss = sum_loss + float(loss.data.cpu().numpy()[0])
                sum_prob = sum_prob + float(prob.data.cpu().numpy()[0])

                inferred_ix = int(torch.max(volatile["attention_logits"],
                                            0)[1].data.cpu().numpy()[0])
                if gold_ix == inferred_ix:
                    sum_acc = sum_acc + 1.0
                if row is not None:
                    sum_dist = sum_dist \
                               + abs(row - int(round(inferred_ix / self.final_width))) \
                               + abs(col - int(inferred_ix % self.final_width))
                    num_visible_items += 1
                    if self.is_close_enough(inferred_ix, row, col):
                        total_epsilon_accuracy += 1

        total_validation_loss += sum_loss
        total_validation_prob += sum_prob
        total_goal_distance += sum_dist
        total_validation_exact_accuracy += sum_acc
        num_items += num_items_

    mean_total_goal_distance = total_goal_distance / float(max(num_items, 1))
    mean_total_validation_loss = total_validation_loss / float(max(num_items, 1))
    mean_total_validation_prob = total_validation_prob / float(max(num_items, 1))
    mean_total_validation_accuracy = (total_validation_exact_accuracy * 100.0) / float(
        max(num_items, 1))
    mean_total_epsilon_accuracy = (total_epsilon_accuracy * 100.0) / float(
        max(num_visible_items, 1))

    logging.info(
        "Mean Test result: L1 Distance is %r, Loss %r, Prob %r, Acc is %r, Epsilon Accuracy is %r"
        % (mean_total_goal_distance, mean_total_validation_loss,
           mean_total_validation_prob, mean_total_validation_accuracy,
           mean_total_epsilon_accuracy))
    logging.info(
        "Num visible items %r, Num Exact Match items is %r, Num epsilon match %r, Num Items is %r "
        % (num_visible_items, total_validation_exact_accuracy,
           total_epsilon_accuracy, num_items))
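# `is_close_enough` is referenced above but not defined in this section. A
# minimal sketch of one plausible implementation, assuming the "within 10%
# radius" comment means 10% of the diagonal of the final_height x final_width
# attention grid (the threshold and the distance measure are assumptions, not
# the codebase's confirmed definition):
def is_close_enough(self, inferred_ix, row, col):
    """Hypothetical check: predicted cell within 10% of the grid diagonal."""
    inferred_row = inferred_ix // self.final_width
    inferred_col = inferred_ix % self.final_width
    epsilon = 0.1 * ((self.final_height ** 2 + self.final_width ** 2) ** 0.5)
    distance = ((inferred_row - row) ** 2 + (inferred_col - col) ** 2) ** 0.5
    return distance <= epsilon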
def do_train_(house_id, shared_model, config, action_space, meta_data_util,
              constants, train_dataset, tune_dataset, experiment, experiment_name,
              rank, server, logger, model_type, vocab, use_pushover=False):

    logger.log("In Training...")
    launch_k_unity_builds([config["port"]],
                          "./house_" + str(house_id) + "_elmer.x86_64",
                          arg_str="--config ./AssetsHouse/config" + str(house_id) + ".json",
                          cwd="./simulators/house/")
    logger.log("Launched Builds.")
    server.initialize_server()
    logger.log("Server Initialized.")

    # Test policy
    test_policy = gp.get_argmax_action

    if rank == 0:  # client 0 creates a tensorboard server
        tensorboard = Tensorboard(experiment_name)
        logger.log('Created Tensorboard Server.')
    else:
        tensorboard = None

    # Pushover logging is currently disabled: both branches yield None
    if use_pushover:
        pushover_logger = None
    else:
        pushover_logger = None

    # Create a local model for rollouts
    local_model = model_type(config, constants)
    # local_model.train()

    # Create the Agent
    tmp_agent = TmpHouseAgent(server=server,
                              model=local_model,
                              test_policy=test_policy,
                              action_space=action_space,
                              meta_data_util=meta_data_util,
                              config=config,
                              constants=constants)
    logger.log("Created Agent.")

    action_counts = [0] * action_space.num_actions()
    max_epochs = 100000  # constants["max_epochs"]
    dataset_size = len(train_dataset)
    tune_dataset_size = len(tune_dataset)

    if tune_dataset_size > 0:
        # Test on tuning data
        tmp_agent.test(tune_dataset, vocab, tensorboard=tensorboard,
                       logger=logger, pushover_logger=pushover_logger)

    # Create the learner to compute the loss
    learner = TmpAsynchronousContextualBandit(shared_model, local_model,
                                              action_space, meta_data_util,
                                              config, constants, tensorboard)
    # TODO change 2 --- unity launch moved up
    learner.logger = logger

    for epoch in range(1, max_epochs + 1):

        for data_point_ix, data_point in enumerate(train_dataset):

            # Sync with the shared model
            # local_model.load_state_dict(shared_model.state_dict())
            local_model.load_from_state_dict(shared_model.get_state_dict())

            if (data_point_ix + 1) % 100 == 0:
                logger.log("Done %d out of %d" % (data_point_ix, dataset_size))
                logger.log("Training data action counts %r" % action_counts)

            num_actions = 0
            max_num_actions = constants["horizon"]
            max_num_actions += constants["max_extra_horizon"]

            image, metadata = tmp_agent.server.reset_receive_feedback(data_point)
            instruction = data_point.get_instruction()
            # instruction_str = TmpAsynchronousContextualBandit.convert_indices_to_text(instruction, vocab)
            # print("Instruction str is ", instruction_str)

            # Pose and Orientation gone TODO change 3
            state = AgentObservedState(instruction=instruction,
                                       config=config,
                                       constants=constants,
                                       start_image=image,
                                       previous_action=None,
                                       data_point=data_point)
            state.goal = learner.get_goal(metadata)

            model_state = None
            batch_replay_items = []
            total_reward = 0
            forced_stop = True

            while num_actions < max_num_actions:
                # logger.log("Training: Meta Data %r " % metadata)

                # Sample action using the policy
                log_probabilities, model_state, image_emb_seq, state_feature = \
                    local_model.get_probs(state, model_state)
                probabilities = list(torch.exp(log_probabilities.data))[0]

                # Sample action from the probability
                action = gp.sample_action_from_prob(probabilities)
                action_counts[action] += 1

                if action == action_space.get_stop_action_index():
                    forced_stop = False
                    break

                # Send the action and get feedback
                image, reward, metadata = tmp_agent.server.send_action_receive_feedback(action)
                # logger.log("Action is %r, Reward is %r Probability is %r " % (action, reward, probabilities))

                # Store it in the replay memory list
                replay_item = ReplayMemoryItem(state, action, reward,
                                               log_prob=log_probabilities)
                batch_replay_items.append(replay_item)

                # Update the agent state
                # Pose and orientation gone, TODO change 4
                state = state.update(image, action, data_point=data_point)
                state.goal = learner.get_goal(metadata)

                num_actions += 1
                total_reward += reward

            # Send final STOP action and get feedback
            image, reward, metadata = tmp_agent.server.halt_and_receive_feedback()
            total_reward += reward

            # Store it in the replay memory list
            if not forced_stop:
                # logger.log("Action is Stop, Reward is %r Probability is %r " % (reward, probabilities))
                replay_item = ReplayMemoryItem(state,
                                               action_space.get_stop_action_index(),
                                               reward,
                                               log_prob=log_probabilities)
                batch_replay_items.append(replay_item)

            # Update the scores based on meta_data
            # self.meta_data_util.log_results(metadata)

            # Perform update
            if len(batch_replay_items) > 0:  # 32
                loss_val = learner.do_update(batch_replay_items)

                if tensorboard is not None:
                    # cross_entropy = float(learner.cross_entropy.data[0])
                    # tensorboard.log(cross_entropy, loss_val, 0)
                    tensorboard.log_scalar("loss", loss_val)
                    entropy = float(learner.entropy.data[0]) / float(num_actions + 1)
                    tensorboard.log_scalar("entropy", entropy)
                    ratio = float(learner.ratio.data[0])
                    tensorboard.log_scalar("Abs_objective_to_entropy_ratio", ratio)
                    tensorboard.log_scalar("total_reward", total_reward)
                    tensorboard.log_scalar("mean navigation error",
                                           metadata['mean-navigation-error'])

                    if learner.action_prediction_loss is not None:
                        action_prediction_loss = float(learner.action_prediction_loss.data[0])
                        learner.tensorboard.log_action_prediction_loss(action_prediction_loss)
                    if learner.temporal_autoencoder_loss is not None:
                        temporal_autoencoder_loss = float(learner.temporal_autoencoder_loss.data[0])
                        tensorboard.log_temporal_autoencoder_loss(temporal_autoencoder_loss)
                    if learner.object_detection_loss is not None:
                        object_detection_loss = float(learner.object_detection_loss.data[0])
                        tensorboard.log_object_detection_loss(object_detection_loss)
                    if learner.symbolic_language_prediction_loss is not None:
                        symbolic_language_prediction_loss = float(
                            learner.symbolic_language_prediction_loss.data[0])
                        tensorboard.log_scalar("sym_language_prediction_loss",
                                               symbolic_language_prediction_loss)
                    if learner.goal_prediction_loss is not None:
                        goal_prediction_loss = float(learner.goal_prediction_loss.data[0])
                        tensorboard.log_scalar("goal_prediction_loss", goal_prediction_loss)

        # Save the model
        local_model.save_model(experiment + "/contextual_bandit_" +
                               str(rank) + "_epoch_" + str(epoch))
        logger.log("Training data action counts %r" % action_counts)

        if tune_dataset_size > 0:
            # Test on tuning data
            tmp_agent.test(tune_dataset, vocab, tensorboard=tensorboard,
                           logger=logger, pushover_logger=pushover_logger)
class Client:
    """ Client can be in one of the following states:
    1. Free and waiting for a new example
    2. Waiting to take the next action
    3. Waiting to receive the next image and message.

    Client operates in an automaton following the transitions below:
    Wait for a new example -> repeat [Take an action -> Wait to receive next
    image and message] -> Go back to (1)
    """

    WAITING_FOR_EXAMPLE, WAITING_FOR_ACTION, WAITING_TO_RECEIVE = range(3)

    def __init__(self, agent, config, constants, action_space, tensorboard,
                 client_ix, batch_replay_items, dagger_beta):
        self.agent = agent
        self.config = config
        self.constants = constants
        self.action_space = action_space
        self.tensorboard = tensorboard

        # Client specific information
        self.status = Client.WAITING_FOR_EXAMPLE
        self.client_ix = client_ix
        self.server = agent.servers[client_ix]
        self.metadata = None

        # Datapoint specific variables
        self.max_num_actions = None
        self.state = None
        self.model_state = None
        self.image_emb_seq = None
        self.current_data_point = None
        self.last_action = None
        self.last_expert_action = None
        self.last_log_prob = None
        self.factor_entropy = None
        self.num_action = 0
        self.total_reward = 0
        self.forced_stop = False
        self.batch_replay_items = batch_replay_items
        self.local_batch_replay_items = []

        # Learning algorithm specific variables
        self.beta = dagger_beta

    def get_state(self):
        return self.state

    def get_status(self):
        return self.status

    def get_model_state(self):
        return self.model_state

    def update_dagger_beta(self, new_beta):
        self.beta = new_beta

    def try_to_progress(self):

        # If in state (1) or (2) then return immediately
        if self.status == Client.WAITING_FOR_EXAMPLE or self.status == Client.WAITING_FOR_ACTION:
            return self.status

        assert self.status == Client.WAITING_TO_RECEIVE

        # If in state (3) then see if the message is available. If the message
        # is available then return to waiting for an action or a new example.
        if self.state is None:
            feedback = self.server.receive_reset_feedback_nonblocking()
        else:
            feedback = self.server.receive_feedback_nonblocking()

        if feedback is None:
            return self.status
        else:
            if self.state is None:
                # assert False, "state should not be none"
                # Feedback is in response to reset
                image, metadata = feedback

                pose = int(metadata["y_angle"] / 15.0)
                position_orientation = (metadata["x_pos"], metadata["z_pos"],
                                        metadata["y_angle"])
                self.state = AgentObservedState(
                    instruction=self.current_data_point.instruction,
                    config=self.config,
                    constants=self.constants,
                    start_image=image,
                    previous_action=None,
                    pose=pose,
                    position_orientation=position_orientation,
                    data_point=self.current_data_point)

                # Waiting for action
                self.status = Client.WAITING_FOR_ACTION
            else:
                # Feedback is in response to an action
                image, reward, metadata = feedback
                self.total_reward += reward

                # Create a replay item unless it is forced
                if not self.forced_stop:
                    symbolic_text = nav_drone_symbolic_instructions.get_nav_drone_symbolic_instruction_segment(
                        self.current_data_point)
                    replay_item = ReplayMemoryItem(
                        self.state, self.last_expert_action, reward,
                        log_prob=self.last_log_prob,
                        image_emb_seq=self.image_emb_seq,
                        factor_entropy=self.factor_entropy,
                        text_emb=self.model_state[0],
                        symbolic_text=symbolic_text)
                    self.local_batch_replay_items.append(replay_item)

                # Update the agent state
                pose = int(metadata["y_angle"] / 15.0)
                position_orientation = (metadata["x_pos"], metadata["z_pos"],
                                        metadata["y_angle"])
                self.state = self.state.update(
                    image, self.last_action, pose=pose,
                    position_orientation=position_orientation,
                    data_point=self.current_data_point)

                if self.last_action == self.agent.action_space.get_stop_action_index():
                    # Update the scores based on meta_data
                    # self.meta_data_util.log_results(metadata)
                    self.__flush_to_global_batch()
                    if self.tensorboard is not None:
                        self.tensorboard.log_all_train_errors(
                            metadata["edit_dist_error"],
                            metadata["closest_dist_error"],
                            metadata["stop_dist_error"])
                    self.status = Client.WAITING_FOR_EXAMPLE
                elif self.num_action >= self.max_num_actions:
                    # Send forced stop action and wait to receive
                    self._take_forced_stop()
                    self.status = Client.WAITING_TO_RECEIVE
                else:
                    # Wait to take another action
                    self.status = Client.WAITING_FOR_ACTION

            self.metadata = metadata
            return self.status

    def accept_new_example(self, data_point, max_num_actions):
        assert self.status == Client.WAITING_FOR_EXAMPLE
        self.state = None
        self.metadata = None
        self.model_state = None
        self.image_emb_seq = None
        self.factor_entropy = None
        self.max_num_actions = max_num_actions
        self.server.reset_nonblocking(data_point)
        self.current_data_point = data_point
        self.last_action = None
        self.last_expert_action = None
        self.last_log_prob = None
        self.num_action = 0
        self.total_reward = 0
        self.forced_stop = False
        self.local_batch_replay_items = []
        self.status = Client.WAITING_TO_RECEIVE

    def __flush_to_global_batch(self):
        """ Add the batch items to the global memory """
        for item in self.local_batch_replay_items:
            self.batch_replay_items.append(item)
        self.local_batch_replay_items = []

    def get_dagger_reference_action(self):
        goal_x, goal_z = self.current_data_point.get_destination_list()[-1]
        action = oracle_policy(self.metadata, goal_x, goal_z,
                               self.current_data_point)
        return action

    def generate_dagger_probability(self, log_probabilities):
        policy_probability = list(torch.exp(log_probabilities.data))[0]
        reference_action_name = self.get_dagger_reference_action()
        reference_action = self.action_space.get_action_index(reference_action_name)

        # Create a mixture policy from the agent policy and the deterministic
        # reference policy
        num_action_space = self.config["num_actions"]
        policy = [0] * num_action_space
        for i in range(0, num_action_space):
            policy[i] = (1 - self.beta) * policy_probability[i]
            if i == reference_action:
                policy[i] += self.beta

        return policy, reference_action

    def take_action(self, log_probabilities, new_model_state, image_emb_seq,
                    factor_entropy):
        assert self.status == Client.WAITING_FOR_ACTION

        # probability = list(torch.exp(log_probabilities.data))[0]
        probability, reference_action = self.generate_dagger_probability(log_probabilities)
        self.model_state = new_model_state
        self.last_log_prob = log_probabilities
        self.image_emb_seq = image_emb_seq
        self.factor_entropy = factor_entropy

        # Use test policy to get the action
        self.last_action = gp.sample_action_from_prob(probability)
        self.last_expert_action = reference_action
        self.num_action += 1

        # if self.metadata["goal_dist"] < 5:
        #     # Add a forced stop action to replay items
        #     imp_weight = float(probability[3])
        #     reward = 1.0
        #     print "Added with reward of " + str(reward * imp_weight)
        #     replay_item = ReplayMemoryItem(
        #         self.state, self.agent.action_space.get_stop_action_index(), reward * imp_weight,
        #         log_prob=self.last_log_prob, image_emb_seq=self.image_emb_seq, factor_entropy=self.factor_entropy)
        #     self.batch_replay_items.append(replay_item)

        if self.last_action == self.agent.action_space.get_stop_action_index():
            self.server.halt_nonblocking()
        else:
            self.server.send_action_nonblocking(self.last_action)

        self.status = Client.WAITING_TO_RECEIVE

    def _take_forced_stop(self):
        # Use test policy to get the action
        self.last_action = self.agent.action_space.get_stop_action_index()
        self.forced_stop = True
        self.server.halt_nonblocking()
        self.status = Client.WAITING_TO_RECEIVE
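# The docstring above describes the client automaton, but the driver that
# multiplexes several clients is not shown in this section. A minimal sketch,
# assuming a list of `clients`, an incremental `model` whose get_probs returns
# (log_probs, model_state, image_emb_seq, factor_entropy) as take_action
# expects, and an iterable `dataset` (all names here are illustrative):
def drive_clients(clients, model, dataset, max_num_actions):
    examples = iter(dataset)
    while True:
        for client in clients:
            status = client.try_to_progress()
            if status == Client.WAITING_FOR_EXAMPLE:
                # Hand the free client its next datapoint, if any remain
                data_point = next(examples, None)
                if data_point is not None:
                    client.accept_new_example(data_point, max_num_actions)
            elif status == Client.WAITING_FOR_ACTION:
                # Query the policy and let the client act (non-blocking)
                log_probs, model_state, image_emb_seq, factor_entropy = \
                    model.get_probs(client.get_state(), client.get_model_state())
                client.take_action(log_probs, model_state, image_emb_seq,
                                   factor_entropy)
            # WAITING_TO_RECEIVE clients are simply polled again next pass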
def test_save_oracle_images(self, test_dataset, max_traj_len=None):
    self.server.clear_metadata()
    action_counts = [0] * self.action_space.num_actions()
    metadata = {"feedback": ""}

    for data_point_ix, data_point in enumerate(test_dataset):
        print("Resetting ", data_point.get_instruction())
        image, metadata = self.server.reset_receive_feedback(data_point)
        pose = int(metadata["y_angle"] / 15.0)
        position_orientation = (metadata["x_pos"], metadata["z_pos"],
                                metadata["y_angle"])
        state = AgentObservedState(
            instruction=data_point.instruction,
            config=self.config,
            constants=self.constants,
            start_image=image,
            previous_action=None,
            pose=pose,
            position_orientation=position_orientation,
            data_point=data_point)

        num_actions = 0
        trajectory = data_point.get_trajectory()
        os.mkdir("./synthetic_v2_10k_images/test_images/example_" + str(data_point_ix))
        Agent.save_image_and_metadata(image, state, data_point_ix, num_actions)

        while True:
            # Use test policy to get the action
            if num_actions == len(trajectory) or (
                    max_traj_len is not None and num_actions >= max_traj_len):
                action = self.action_space.get_stop_action_index()
            else:
                action = trajectory[num_actions]

            if action == self.action_space.get_stop_action_index():
                # Send the action and get feedback
                image, reward, metadata = self.server.halt_and_receive_feedback()

                # Update the scores based on meta_data
                self.meta_data_util.log_results(metadata)
                break
            else:
                # Send the action and get feedback
                image, reward, metadata = self.server.send_action_receive_feedback(action)

                # Update the agent state
                pose = int(metadata["y_angle"] / 15.0)
                position_orientation = (metadata["x_pos"], metadata["z_pos"],
                                        metadata["y_angle"])
                state = state.update(
                    image, action, pose=pose,
                    position_orientation=position_orientation,
                    data_point=data_point)
                num_actions += 1
                Agent.save_image_and_metadata(image, state, data_point_ix, num_actions)

    logging.info("Overall test result: ")
    self.meta_data_util.log_results(metadata)
    logging.info("Testing data action counts %r", action_counts)
def test_single_step(self, test_dataset, vocab, goal_type="gold",
                     tensorboard=None, logger=None, pushover_logger=None):
    """ Perform single-step testing, i.e., the goal prediction module is
    called only once. """

    self.server.clear_metadata()
    action_counts = [0] * self.action_space.num_actions()
    task_completion_accuracy = 0
    metadata = {"feedback": ""}

    for data_point_ix, data_point in enumerate(test_dataset):
        instruction_string = " ".join(
            [vocab[token_id] for token_id in data_point.instruction])
        self.log("Instruction is %r " % instruction_string, logger)

        # Call the navigation model
        image, metadata = self.server.reset_receive_feedback(data_point)

        if goal_type == "inferred":
            # Get the panorama and set tracking
            self._explore_and_set_tracking(data_point, data_point_ix,
                                           instruction_string)

        state = AgentObservedState(instruction=data_point.instruction,
                                   config=self.config,
                                   constants=self.constants,
                                   start_image=image,
                                   previous_action=None,
                                   data_point=data_point)
        state.goal = self.get_goal(metadata, goal_type)
        num_actions = 0
        max_num_actions = self.constants["horizon"]
        model_state = None

        while True:
            # Generate probabilities over actions
            if isinstance(self.navigation_model, AbstractModel):
                probabilities = list(
                    torch.exp(self.navigation_model.get_probs(state).data))
            elif isinstance(self.navigation_model, AbstractIncrementalModel):
                log_probabilities, model_state, _, _ = self.navigation_model.get_probs(
                    state, model_state, volatile=True)
                probabilities = list(torch.exp(log_probabilities.data))[0]
            else:
                log_probabilities, model_state = self.navigation_model.get_probs(
                    state, model_state)
                probabilities = list(torch.exp(log_probabilities.data))
                # raise AssertionError("Unhandled Model type.")

            # Use test policy to get the action
            action = self.test_policy(probabilities)
            action_counts[action] += 1

            if action == self.action_space.get_stop_action_index() \
                    or num_actions >= max_num_actions:
                # Send the action and get feedback
                image, reward, metadata = self.server.halt_and_receive_feedback()
                # if tensorboard is not None:
                #     tensorboard.log_all_test_errors(
                #         metadata["edit_dist_error"],
                #         metadata["closest_dist_error"],
                #         metadata["stop_dist_error"])
                # self.log("Testing: Taking stop action and got reward %r " % reward, logger)

                if metadata["navigation-error"] <= 1.0:
                    task_completion_accuracy += 1

                # Update the scores based on meta_data
                # self.meta_data_util.log_results(metadata, logger)
                self.log("Overall test results: %r " % metadata, logger)

                #############################################
                # Take a dummy manipulation action
                # row, col, row_real, col_real = state.goal
                # if row is not None and col is not None:
                #     act_name = "interact %r %r" % (row, col)
                #     interact_action = self.action_space.get_action_index(act_name)
                #     image, reward, metadata = self.server.send_action_receive_feedback(interact_action)
                #############################################
                break
            else:
                # Send the action and get feedback
                image, reward, metadata = self.server.send_action_receive_feedback(action)
                # self.log("Testing: Taking action %r and got reward %r " % (action, reward), logger)
                # time.sleep(0.5)

                # Update the agent state
                state = state.update(image, action, data_point=data_point)
                state.goal = self.get_goal(metadata, goal_type)
                num_actions += 1

    task_completion_accuracy = (task_completion_accuracy * 100.0) / float(
        max(len(test_dataset), 1))
    self.log("Overall test results:", logger)
    self.log("Testing: Final Metadata: %r" % metadata, logger)
    self.log("Testing: Action Distribution: %r" % action_counts, logger)
    self.log("Testing: Task Completion Accuracy: %r " % task_completion_accuracy, logger)
    # self.meta_data_util.log_results(metadata, logger)
    self.log("Testing data action counts %r" % action_counts, logger)

    if pushover_logger is not None:
        pushover_feedback = str(metadata["feedback"])
        pushover_logger.log(pushover_feedback)
class Client:
    """ Client can be in one of the following states:
    1. Free and waiting for a new example
    2. Waiting to take the next action
    3. Waiting to receive the next image and message.

    Client operates in an automaton following the transitions below:
    Wait for a new example -> repeat [Take an action -> Wait to receive next
    image and message] -> Go back to (1)
    """

    WAITING_FOR_EXAMPLE, WAITING_FOR_ACTION, WAITING_TO_RECEIVE = range(3)

    def __init__(self, agent, config, constants, tensorboard, client_ix,
                 batch_replay_items):
        self.agent = agent
        self.config = config
        self.constants = constants
        self.tensorboard = tensorboard

        # Client specific information
        self.status = Client.WAITING_FOR_EXAMPLE
        self.client_ix = client_ix
        self.server = agent.server  # agent.servers[client_ix]
        self.metadata = None

        # Datapoint specific variables
        self.max_num_actions = None
        self.state = None
        self.model_state = None
        self.image_emb_seq = None
        self.current_data_point = None
        self.last_action = None
        self.last_log_prob = None
        self.factor_entropy = None
        self.num_action = 0
        self.total_reward = 0
        self.forced_stop = False
        self.batch_replay_items = batch_replay_items

    def get_state(self):
        return self.state

    def get_status(self):
        return self.status

    def get_model_state(self):
        return self.model_state

    def _get_all_rewards(self, metadata):
        rewards = []
        for i in range(0, self.config["num_actions"]):
            reward = metadata["reward_dict"][
                self.agent.action_space.get_action_name(i)]
            rewards.append(reward)
        return rewards

    def try_to_progress(self):

        # If in state (1) or (2) then return immediately
        if self.status == Client.WAITING_FOR_EXAMPLE or self.status == Client.WAITING_FOR_ACTION:
            return self.status

        assert self.status == Client.WAITING_TO_RECEIVE

        # If in state (3) then see if the message is available. If the message
        # is available then return to waiting for an action or a new example.
        if self.state is None:
            feedback = self.server.receive_reset_feedback_nonblocking()
        else:
            feedback = self.server.receive_feedback_nonblocking()

        if feedback is None:
            return self.status
        else:
            if self.state is None:
                # assert False, "state should not be none"
                # Feedback is in response to reset
                image, metadata = feedback

                pose = int(metadata["y_angle"] / 15.0)
                position_orientation = (metadata["x_pos"], metadata["z_pos"],
                                        metadata["y_angle"])
                self.state = AgentObservedState(
                    instruction=self.current_data_point.instruction,
                    config=self.config,
                    constants=self.constants,
                    start_image=image,
                    previous_action=None,
                    data_point=self.current_data_point)

                # Waiting for action
                self.status = Client.WAITING_FOR_ACTION
            else:
                # Feedback is in response to an action
                image, reward, metadata = feedback
                self.total_reward += reward

                # Create a replay item unless it is forced
                if not self.forced_stop:
                    all_rewards = self._get_all_rewards(metadata)
                    replay_item = ReplayMemoryItem(
                        self.state, self.last_action, reward,
                        log_prob=self.last_log_prob,
                        image_emb_seq=self.image_emb_seq,
                        factor_entropy=self.factor_entropy,
                        all_rewards=all_rewards)
                    self.batch_replay_items.append(replay_item)

                # Update the agent state
                self.state = self.state.update(
                    image, self.last_action,
                    data_point=self.current_data_point)

                if self.last_action == self.agent.action_space.get_stop_action_index():
                    # Update the scores based on meta_data
                    # self.meta_data_util.log_results(metadata)
                    if self.tensorboard is not None:
                        self.tensorboard.log_all_train_errors(
                            metadata["edit_dist_error"],
                            metadata["closest_dist_error"],
                            metadata["stop_dist_error"])
                    self.status = Client.WAITING_FOR_EXAMPLE
                elif self.num_action >= self.max_num_actions:
                    # Send forced stop action and wait to receive
                    self._take_forced_stop()
                    self.status = Client.WAITING_TO_RECEIVE
                else:
                    # Wait to take another action
                    self.status = Client.WAITING_FOR_ACTION

            self.metadata = metadata
            return self.status

    def accept_new_example(self, data_point, max_num_actions):
        assert self.status == Client.WAITING_FOR_EXAMPLE
        self.state = None
        self.metadata = None
        self.model_state = None
        self.image_emb_seq = None
        self.factor_entropy = None
        self.max_num_actions = max_num_actions
        self.server.reset_nonblocking(data_point)
        self.current_data_point = data_point
        self.last_action = None
        self.last_log_prob = None
        self.num_action = 0
        self.total_reward = 0
        self.forced_stop = False
        self.status = Client.WAITING_TO_RECEIVE

    def take_action(self, log_probabilities, new_model_state, image_emb_seq,
                    factor_entropy):
        assert self.status == Client.WAITING_FOR_ACTION

        probability = list(torch.exp(log_probabilities.data))[0]
        self.model_state = new_model_state
        self.last_log_prob = log_probabilities
        self.image_emb_seq = image_emb_seq
        self.factor_entropy = factor_entropy

        # Use test policy to get the action
        self.last_action = gp.sample_action_from_prob(probability)
        self.num_action += 1

        # if self.metadata["goal_dist"] < 5:
        #     # Add a forced stop action to replay items
        #     imp_weight = float(probability[3])
        #     reward = 1.0
        #     replay_item = ReplayMemoryItem(
        #         self.state, self.agent.action_space.get_stop_action_index(), reward * imp_weight,
        #         log_prob=self.last_log_prob, image_emb_seq=self.image_emb_seq, factor_entropy=self.factor_entropy)
        #     self.batch_replay_items.append(replay_item)

        if self.last_action == self.agent.action_space.get_stop_action_index():
            self.server.halt_nonblocking()
        else:
            self.server.send_action_nonblocking(self.last_action)

        self.status = Client.WAITING_TO_RECEIVE

    def reset_datapoint_blocking(self, datapoint):
        """ Resets to the given datapoint and returns the starting image """
        image, metadata = self.server.reset_receive_feedback(datapoint)
        return image, metadata

    def take_action_blocking(self, action):
        """ Takes an action and returns image, reward and metadata """
        if action == self.agent.action_space.get_stop_action_index():
            image, reward, metadata = self.server.halt_and_receive_feedback()
            done = True
        else:
            image, reward, metadata = self.server.send_action_receive_feedback(action)
            done = False
        return image, reward, metadata, done

    def _take_forced_stop(self):
        # Use test policy to get the action
        self.last_action = self.agent.action_space.get_stop_action_index()
        self.forced_stop = True
        self.server.halt_nonblocking()
        self.status = Client.WAITING_TO_RECEIVE
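# The blocking helpers at the end of the class support a simple synchronous
# rollout alongside the nonblocking automaton. A minimal usage sketch, assuming
# a `client`, a `policy` callable mapping the latest observation to an action
# index, and a `data_point` (the policy interface is an assumption):
def rollout_blocking(client, policy, data_point, max_num_actions):
    image, metadata = client.reset_datapoint_blocking(data_point)
    total_reward, done, num_actions = 0.0, False, 0
    while not done and num_actions < max_num_actions:
        action = policy(image, metadata)  # assumed policy interface
        image, reward, metadata, done = client.take_action_blocking(action)
        total_reward += reward
        num_actions += 1
    if not done:
        # Force a stop so the server finishes the episode
        stop = client.agent.action_space.get_stop_action_index()
        image, reward, metadata, done = client.take_action_blocking(stop)
        total_reward += reward
    return total_reward, metadata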
def test(self, test_dataset, vocab, tensorboard=None, logger=None,
         pushover_logger=None):
    self.server.clear_metadata()
    action_counts = [0] * self.action_space.num_actions()
    task_completion_accuracy = 0
    metadata = {"feedback": ""}
    sum_bisk_metric = 0

    for data_point_ix, data_point in enumerate(test_dataset):
        image, metadata = self.server.reset_receive_feedback(data_point)
        sum_bisk_metric += metadata["metric"]
        instruction = self.convert_text_to_indices(metadata["instruction"], vocab)
        state = AgentObservedState(instruction=instruction,
                                   config=self.config,
                                   constants=self.constants,
                                   start_image=image,
                                   previous_action=None,
                                   data_point=data_point)
        # state.start_read_pointer, state.end_read_pointer = data_point.get_instruction_indices()
        num_actions = 0
        max_num_actions = self.constants["horizon"]
        model_state = None

        while True:
            # Generate probabilities over actions
            if isinstance(self.model, AbstractModel):
                probabilities = list(torch.exp(self.model.get_probs(state).data))
            elif isinstance(self.model, AbstractIncrementalModel):
                log_probabilities, model_state, _, _ = self.model.get_probs(
                    state, model_state, volatile=True)
                probabilities = list(torch.exp(log_probabilities.data))[0]
            else:
                # print "Num action is " + str(num_actions) + " and max is " + str(max_num_actions)
                log_probabilities, model_state = self.model.get_probs(state, model_state)
                probabilities = list(torch.exp(log_probabilities.data))
                # raise AssertionError("Unhandled Model type.")

            # Use test policy to get the action
            action = self.test_policy(probabilities)
            action_counts[action] += 1

            if action == self.action_space.get_stop_action_index() \
                    or num_actions >= max_num_actions:
                # Send the action and get feedback
                image, reward, metadata = self.server.halt_and_receive_feedback()
                # if tensorboard is not None:
                #     tensorboard.log_all_test_errors(
                #         metadata["edit_dist_error"],
                #         metadata["closest_dist_error"],
                #         metadata["stop_dist_error"])
                # if metadata["stop_dist_error"] < 5.0:
                #     task_completion_accuracy += 1
                # Update the scores based on meta_data
                # self.meta_data_util.log_results(metadata, logger)
                break
            else:
                # Send the action and get feedback
                image, reward, metadata = self.server.send_action_receive_feedback(action)

                # Update the agent state
                state = state.update(image, action, data_point=data_point)
                num_actions += 1

    self.log("Overall test results:", logger)
    self.log("Mean Bisk Metric %r"
             % (sum_bisk_metric / float(len(test_dataset))), logger)
    # self.log("Testing: Final Metadata: %r" % metadata, logger)
    self.log("Testing: Action Distribution: %r" % action_counts, logger)
    # self.meta_data_util.log_results(metadata, logger)
    self.log("Testing data action counts %r" % action_counts, logger)
    if pushover_logger is not None:
        pushover_feedback = str(metadata["feedback"])
        pushover_logger.log(pushover_feedback)
def _test(self, data_point_ix, data_point, test_image, tensorboard=None,
          debug=False):
    image, metadata = self.server.reset_receive_feedback(data_point)
    pose = int(metadata["y_angle"] / 15.0)
    position_orientation = (metadata["x_pos"], metadata["z_pos"],
                            metadata["y_angle"])
    state = AgentObservedState(
        instruction=data_point.instruction,
        config=self.config,
        constants=self.constants,
        start_image=image,
        previous_action=None,
        pose=pose,
        position_orientation=position_orientation,
        data_point=data_point,
        prev_instruction=data_point.get_prev_instruction(),
        next_instruction=data_point.get_next_instruction())

    # Test image
    if test_image is None:
        test_image_example = self.get_exploration_image()
    else:
        test_image_example = test_image[data_point_ix][0]

    # Predict the goal
    predicted_goal, predictor_error, predicted_pixel, attention_prob = self.get_3d_location(
        test_image_example, data_point, panaroma=True)
    current_bot_location = metadata["x_pos"], metadata["z_pos"]
    current_bot_pose = metadata["y_angle"]
    state.goal = PredictorPlannerAgent.get_goal_location(
        current_bot_location, current_bot_pose, predicted_goal, 32, 32)
    print("Predicted Error ", predictor_error)

    num_actions = 0
    max_num_actions = self.constants["horizon"]
    model_state = None
    actions = []
    info = dict()

    while True:
        # Generate probabilities over actions
        if isinstance(self.model, AbstractModel):
            probabilities = list(torch.exp(self.model.get_probs(state).data))
        elif isinstance(self.model, AbstractIncrementalModel):
            log_probabilities, model_state, _, _ = self.model.get_probs(
                state, model_state, volatile=True)
            probabilities = list(torch.exp(log_probabilities.data))[0]
        else:
            raise AssertionError("Unhandled Model type.")

        # Use test policy to get the action
        action = self.test_policy(probabilities)
        actions.append(action)

        if action == self.action_space.get_stop_action_index() \
                or num_actions >= max_num_actions:
            # Send the action and get feedback
            image, reward, metadata = self.server.halt_and_receive_feedback()
            if tensorboard is not None:
                tensorboard.log_all_test_errors(
                    metadata["edit_dist_error"],
                    metadata["closest_dist_error"],
                    metadata["stop_dist_error"])

            # Update the scores based on meta_data
            self.meta_data_util.log_results(metadata)

            if debug:
                # Dictionary to contain key results
                info["instruction_string"] = instruction_to_string(
                    data_point.instruction, self.config)
                info["datapoint_id"] = data_point.get_scene_name()
                info["stop_dist_error"] = metadata["stop_dist_error"]
                info["closest_dist_error"] = metadata["closest_dist_error"]
                info["edit_dist_error"] = metadata["edit_dist_error"]
                info["num_actions_taken"] = num_actions
                info["predicted_goal"] = predicted_goal
                info["predicted_error"] = predictor_error
                info["gold_goal"] = data_point.get_destination_list()[-1]
                info["final_location"] = (metadata["x_pos"], metadata["z_pos"])
                info["predicted_screen_pixels"] = predicted_pixel

                self.save_attention_prob(test_image_example, attention_prob,
                                         info["instruction_string"],
                                         info["datapoint_id"])
            break
        else:
            # Send the action and get feedback
            image, reward, metadata = self.server.send_action_receive_feedback(action)

            # Update the agent state
            pose = int(metadata["y_angle"] / 15.0)
            position_orientation = (metadata["x_pos"], metadata["z_pos"],
                                    metadata["y_angle"])
            state = state.update(image, action, pose=pose,
                                 position_orientation=position_orientation,
                                 data_point=data_point)

            # Set the goal based on the current position and angle
            current_bot_location = metadata["x_pos"], metadata["z_pos"]
            current_bot_pose = metadata["y_angle"]
            state.goal = PredictorPlannerAgent.get_goal_location(
                current_bot_location, current_bot_pose, predicted_goal, 32, 32)
            num_actions += 1

    # logging.info("Error, Start-Distance, Turn-Angle, %r %r %r",
    #              metadata["stop_dist_error"], distance, angle)
    return metadata, actions, predictor_error, info
def test(self, test_dataset, tensorboard=None, logger=None, pushover_logger=None):
    self.server.clear_metadata()
    action_counts = [0] * self.action_space.num_actions()
    task_completion_accuracy = 0
    print("Reached Test")

    test_dataset_size = len(test_dataset)
    metadata = {"feedback": ""}
    data_point = random.sample(test_dataset, 1)[0]

    while True:
        print("Please enter an instruction. For sample see:")
        # data_point = random.sample(test_dataset, 1)[0]
        image, metadata = self.server.reset_receive_feedback(data_point)
        print("Sample instruction: ",
              instruction_to_string(data_point.get_instruction(), self.config))
        input_instruction = input("Enter an instruction or enter q to quit ")
        if input_instruction == "q" or input_instruction == "quit":
            break
        input_instruction_ids = self.convert_to_id(input_instruction)

        pose = int(metadata["y_angle"] / 15.0)
        position_orientation = (metadata["x_pos"], metadata["z_pos"],
                                metadata["y_angle"])
        state = AgentObservedState(
            instruction=input_instruction_ids,
            config=self.config,
            constants=self.constants,
            start_image=image,
            previous_action=None,
            pose=pose,
            position_orientation=position_orientation,
            data_point=data_point,
            prev_instruction=data_point.get_prev_instruction(),
            next_instruction=data_point.get_next_instruction())
        # state.start_read_pointer, state.end_read_pointer = data_point.get_instruction_indices()
        num_actions = 0
        max_num_actions = self.constants["horizon"]
        model_state = None
        # print "Model state is new "

        while True:
            time.sleep(0.3)

            # Generate probabilities over actions
            if isinstance(self.model, AbstractModel):
                probabilities = list(torch.exp(self.model.get_probs(state).data))
            elif isinstance(self.model, AbstractIncrementalModel):
                log_probabilities, model_state, _, _ = self.model.get_probs(
                    state, model_state, volatile=True)
                probabilities = list(torch.exp(log_probabilities.data))[0]
            else:
                # print "Num action is " + str(num_actions) + " and max is " + str(max_num_actions)
                log_probabilities, model_state = self.model.get_probs(state, model_state)
                probabilities = list(torch.exp(log_probabilities.data))
                # raise AssertionError("Unhandled Model type.")

            # Use test policy to get the action
            action = self.test_policy(probabilities)
            # DONT FORGET TO REMOVE
            # action = np.random.randint(0, 2)
            action_counts[action] += 1

            if action == self.action_space.get_stop_action_index() \
                    or num_actions >= max_num_actions:
                # Send the action and get feedback
                image, reward, metadata = self.server.halt_and_receive_feedback()
                if tensorboard is not None:
                    tensorboard.log_all_test_errors(
                        metadata["edit_dist_error"],
                        metadata["closest_dist_error"],
                        metadata["stop_dist_error"])

                if metadata["stop_dist_error"] < 5.0:
                    task_completion_accuracy += 1

                # Update the scores based on meta_data
                self.meta_data_util.log_results(metadata)
                break
            else:
                # Send the action and get feedback
                image, reward, metadata = self.server.send_action_receive_feedback(action)

                # Update the agent state
                pose = int(metadata["y_angle"] / 15.0)
                position_orientation = (metadata["x_pos"], metadata["z_pos"],
                                        metadata["y_angle"])
                state = state.update(
                    image, action, pose=pose,
                    position_orientation=position_orientation,
                    data_point=data_point)
                num_actions += 1

    print("Finished testing. Now logging.")
    task_completion_accuracy = (task_completion_accuracy * 100.0) / float(
        max(len(test_dataset), 1))
    self.log("Overall test results:", logger)
    self.log("Testing: Task completion accuracy is: %r" % task_completion_accuracy, logger)
    self.log("Testing: Final Metadata: %r" % metadata, logger)
    self.log("Testing: Action Distribution: %r" % action_counts, logger)
    self.meta_data_util.log_results(metadata, logger)
    self.log("Testing data action counts %r" % action_counts, logger)
    if pushover_logger is not None:
        pushover_feedback = str(metadata["feedback"]) + \
            " --- " + "task_completion_accuracy=%r" % task_completion_accuracy
        pushover_logger.log(pushover_feedback)
def do_train_(simulator_file, shared_model, config, action_space, meta_data_util,
              constants, train_dataset, tune_dataset, experiment, experiment_name,
              rank, server, logger, model_type, use_pushover=False):

    # Launch unity
    launch_k_unity_builds([config["port"]], simulator_file)
    server.initialize_server()

    # Test policy
    test_policy = gp.get_argmax_action

    # torch.manual_seed(args.seed + rank)
    if rank == 0:  # client 0 creates a tensorboard server
        tensorboard = Tensorboard(experiment_name)
    else:
        tensorboard = None

    if use_pushover:
        pushover_logger = PushoverLogger(experiment_name)
    else:
        pushover_logger = None

    # Create a local model for rollouts
    local_model = model_type(config, constants)
    # local_model.train()

    # Create the Agent
    logger.log("STARTING AGENT")
    agent = Agent(server=server,
                  model=local_model,
                  test_policy=test_policy,
                  action_space=action_space,
                  meta_data_util=meta_data_util,
                  config=config,
                  constants=constants)
    logger.log("Created Agent...")

    action_counts = [0] * action_space.num_actions()
    max_epochs = constants["max_epochs"]
    dataset_size = len(train_dataset)
    tune_dataset_size = len(tune_dataset)

    # Create the learner to compute the loss
    learner = AsynchronousContextualBandit(shared_model, local_model, action_space,
                                           meta_data_util, config, constants, tensorboard)

    for epoch in range(1, max_epochs + 1):

        for data_point_ix, data_point in enumerate(train_dataset):

            # Sync with the shared model
            local_model.load_from_state_dict(shared_model.get_state_dict())

            if (data_point_ix + 1) % 100 == 0:
                logger.log("Done %d out of %d" % (data_point_ix, dataset_size))
                logger.log("Training data action counts %r" % action_counts)

            num_actions = 0
            max_num_actions = constants["horizon"] + constants["max_extra_horizon"]

            image, metadata = agent.server.reset_receive_feedback(data_point)
            state = AgentObservedState(instruction=data_point.instruction,
                                       config=config,
                                       constants=constants,
                                       start_image=image,
                                       previous_action=None,
                                       data_point=data_point)
            meta_data_util.start_state_update_metadata(state, metadata)

            model_state = None
            batch_replay_items = []
            total_reward = 0
            forced_stop = True

            while num_actions < max_num_actions:

                # Sample action using the policy
                log_probabilities, model_state, image_emb_seq, volatile = \
                    local_model.get_probs(state, model_state)
                probabilities = list(torch.exp(log_probabilities.data))[0]

                # Sample action from the probability
                action = gp.sample_action_from_prob(probabilities)
                action_counts[action] += 1

                if action == action_space.get_stop_action_index():
                    forced_stop = False
                    break

                # Send the action and get feedback
                image, reward, metadata = agent.server.send_action_receive_feedback(action)

                # Store it in the replay memory list
                replay_item = ReplayMemoryItem(state, action, reward,
                                               log_prob=log_probabilities, volatile=volatile)
                batch_replay_items.append(replay_item)

                # Update the agent state
                state = state.update(image, action, data_point=data_point)
                meta_data_util.state_update_metadata(state, metadata)

                num_actions += 1
                total_reward += reward

            # Send final STOP action and get feedback
            image, reward, metadata = agent.server.halt_and_receive_feedback()
            total_reward += reward

            if tensorboard is not None:
                meta_data_util.state_update_metadata(tensorboard, metadata)

            # Store it in the replay memory list
            if not forced_stop:
                replay_item = ReplayMemoryItem(state, action_space.get_stop_action_index(),
                                               reward, log_prob=log_probabilities, volatile=volatile)
                batch_replay_items.append(replay_item)

            # Perform update
            if len(batch_replay_items) > 0:
                loss_val = learner.do_update(batch_replay_items)
                if tensorboard is not None:
                    entropy = float(learner.entropy.data[0]) / float(num_actions + 1)
                    tensorboard.log_scalar("loss", loss_val)
                    tensorboard.log_scalar("entropy", entropy)
                    tensorboard.log_scalar("total_reward", total_reward)

        # Save the model
        local_model.save_model(experiment + "/contextual_bandit_" + str(rank) +
                               "_epoch_" + str(epoch))
        logger.log("Training data action counts %r" % action_counts)

        if tune_dataset_size > 0:
            # Test on tuning data
            agent.test(tune_dataset, tensorboard=tensorboard,
                       logger=logger, pushover_logger=pushover_logger)
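# --- Editor's sketch (assumption, not part of the original code) ---
# `AsynchronousContextualBandit.do_update` is not shown in this file. In the
# contextual-bandit reading of the rollout loop above, each action is credited
# with its immediate reward only (no return accumulation over the trajectory),
# so one plausible update is the per-step policy gradient below with an entropy
# bonus. The accessor names on the replay items and the entropy coefficient are
# hypothetical.
import torch

def contextual_bandit_loss(batch_replay_items, entropy_coefficient=0.1):
    """Minimal sketch: -sum_t log pi(a_t|s_t) * r_t - lambda * mean entropy."""
    objective, entropies = [], []
    for item in batch_replay_items:
        log_probs = item.log_prob            # 1 x num_actions log-probabilities (hypothetical accessor)
        objective.append(log_probs[0, item.action] * item.reward)
        entropies.append(-(torch.exp(log_probs) * log_probs).sum())
    return -torch.stack(objective).sum() - entropy_coefficient * torch.stack(entropies).mean()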
def test_goal_prediction(self, test_dataset, tensorboard=None, logger=None, pushover_logger=None):

    self.server.clear_metadata()
    action_counts = [0] * self.action_space.num_actions()
    task_completion_accuracy = 0
    sum_loss, count, sum_prob, goal_prob_count = 0, 0, 0, 0

    metadata = {"feedback": ""}
    for data_point_ix, data_point in enumerate(test_dataset):
        print("Datapoint index ", data_point_ix)
        image, metadata = self.server.reset_receive_feedback(data_point)
        pose = int(metadata["y_angle"] / 15.0)
        position_orientation = (metadata["x_pos"], metadata["z_pos"], metadata["y_angle"])
        state = AgentObservedState(
            instruction=data_point.instruction,
            config=self.config,
            constants=self.constants,
            start_image=image,
            previous_action=None,
            pose=pose,
            position_orientation=position_orientation,
            data_point=data_point,
            prev_instruction=data_point.get_prev_instruction(),
            next_instruction=data_point.get_next_instruction())

        ##################################
        state.goal = GoalPrediction.get_goal_location(metadata, data_point, 8, 8)
        print("Instruction is ", instruction_to_string(data_point.instruction, self.config))
        ##################################

        # state.start_read_pointer, state.end_read_pointer = data_point.get_instruction_indices()
        num_actions = 0
        max_num_actions = self.constants["horizon"]
        model_state = None

        trajectory = data_point.get_trajectory()[0:1]
        trajectory_len = len(trajectory)

        while True:
            if num_actions == trajectory_len:
                action = self.action_space.get_stop_action_index()
            else:
                action = trajectory[num_actions]

            # Generate probabilities over actions
            if isinstance(self.model, AbstractModel):
                raise NotImplementedError()
            elif isinstance(self.model, AbstractIncrementalModel):
                log_probabilities, model_state, _, volatile = self.model.get_probs(
                    state, model_state, volatile=True)
                probabilities = list(torch.exp(log_probabilities.data))[0]

                # Compute goal prediction accuracy
                goal_loss, prob, _ = self.goal_prediction_accuracy(state.goal, volatile)
                sum_loss += goal_loss
                count += 1
                if prob is not None:
                    sum_prob += prob
                    goal_prob_count += 1
            else:
                raise NotImplementedError()
                # log_probabilities, model_state = self.model.get_probs(state, model_state)
                # probabilities = list(torch.exp(log_probabilities.data))

            action_counts[action] += 1

            if action == self.action_space.get_stop_action_index() or num_actions >= max_num_actions:
                # Send the action and get feedback
                image, reward, metadata = self.server.halt_and_receive_feedback()
                if tensorboard is not None:
                    tensorboard.log_all_test_errors(
                        metadata["edit_dist_error"],
                        metadata["closest_dist_error"],
                        metadata["stop_dist_error"])

                if metadata["stop_dist_error"] < 5.0:
                    task_completion_accuracy += 1

                # Update the scores based on meta_data
                self.meta_data_util.log_results(metadata)
                break
            else:
                # Send the action and get feedback
                image, reward, metadata = self.server.send_action_receive_feedback(action)

                # Update the agent state
                pose = int(metadata["y_angle"] / 15.0)
                position_orientation = (metadata["x_pos"], metadata["z_pos"], metadata["y_angle"])
                state = state.update(
                    image, action, pose=pose,
                    position_orientation=position_orientation,
                    data_point=data_point)

                ##################################
                state.goal = GoalPrediction.get_goal_location(metadata, data_point, 8, 8)
                ##################################
                num_actions += 1

    print("Finished testing. Now logging.")
    task_completion_accuracy = (task_completion_accuracy * 100.0) / float(max(len(test_dataset), 1))
    self.log("Overall test results:", logger)
    self.log("Testing: Task completion accuracy is: %r" % task_completion_accuracy, logger)
    self.log("Testing: Final Metadata: %r" % metadata, logger)
    self.log("Testing: Action Distribution: %r" % action_counts, logger)
    # Guard the means against empty datasets so logging cannot divide by zero
    self.log("Goal Count %r, Mean Goal Loss %r" %
             (count, sum_loss / float(max(count, 1))), logger)
    self.log("Goal Prob Count %r, Mean Goal Prob %r" %
             (goal_prob_count, sum_prob / float(max(goal_prob_count, 1))), logger)
    self.meta_data_util.log_results(metadata, logger)
    self.log("Testing data action counts %r" % action_counts, logger)
    if pushover_logger is not None:
        pushover_feedback = str(metadata["feedback"]) + \
            " --- task_completion_accuracy=%r" % task_completion_accuracy
        pushover_logger.log(pushover_feedback)
def do_train(self, agent, train_dataset, tune_dataset, experiment_name):
    """ Perform training """

    assert isinstance(agent, ReadPointerAgent), \
        "This learning algorithm works only with ReadPointerAgent"

    dataset_size = len(train_dataset)

    for epoch in range(1, self.max_epoch + 1):
        logging.info("Starting epoch %d", epoch)
        action_counts = dict()
        action_counts[ReadPointerAgent.READ_MODE] = [0] * 2
        action_counts[ReadPointerAgent.ACT_MODE] = [0] * self.action_space.num_actions()

        # Test on tuning data
        agent.test(tune_dataset, tensorboard=self.tensorboard)

        batch_replay_items = []
        total_reward = 0
        episodes_in_batch = 0

        for data_point_ix, data_point in enumerate(train_dataset):
            if (data_point_ix + 1) % 100 == 0:
                logging.info("Done %d out of %d", data_point_ix, dataset_size)
                logging.info("Training data action counts %r", action_counts)

            num_actions = 0
            max_num_actions = len(data_point.get_trajectory())
            max_num_actions += self.constants["max_extra_horizon"]

            image, metadata = agent.server.reset_receive_feedback(data_point)
            state = AgentObservedState(instruction=data_point.instruction,
                                       config=self.config,
                                       constants=self.constants,
                                       start_image=image,
                                       previous_action=None)

            mode = ReadPointerAgent.READ_MODE
            last_action_was_halt = False

            instruction = instruction_to_string(data_point.get_instruction(), self.config)
            print("TRAIN INSTRUCTION: %r" % instruction)
            print("")

            while True:
                # Sample action using the policy
                # Generate probabilities over actions
                probabilities = list(torch.exp(self.model.get_probs(state, mode).data))

                # Use test policy to get the action
                action = gp.sample_action_from_prob(probabilities)
                action_counts[mode][action] += 1

                if mode == ReadPointerAgent.READ_MODE:
                    # Read mode boundary conditions
                    forced_action = False
                    if not state.are_tokens_left_to_be_read():
                        # Force halt
                        action = 1
                        forced_action = True
                    elif num_actions >= max_num_actions or last_action_was_halt:
                        # Force read
                        action = 0
                        forced_action = True

                    if not forced_action:
                        # Store reward in the replay memory list
                        reward = self._calc_reward_read_mode(state, action)
                        replay_item = ReplayMemoryItem(state, action, reward, mode=mode)
                        batch_replay_items.append(replay_item)

                    if action == 0:
                        last_action_was_halt = False
                        state = state.update_on_read()
                    elif action == 1:
                        last_action_was_halt = True
                        mode = ReadPointerAgent.ACT_MODE
                    else:
                        raise AssertionError("Read mode only supports two actions: "
                                             "read(0) and halt(1). Found " + str(action))

                elif mode == ReadPointerAgent.ACT_MODE:
                    # Deal with act mode boundary conditions
                    if num_actions >= max_num_actions:
                        forced_stop = True
                        break
                    elif action == agent.action_space.get_stop_action_index():
                        if state.are_tokens_left_to_be_read():
                            reward = self._calc_reward_act_halt(state)

                            # Add to replay memory
                            replay_item = ReplayMemoryItem(
                                state, agent.action_space.get_stop_action_index(), reward, mode)
                            batch_replay_items.append(replay_item)

                            mode = ReadPointerAgent.READ_MODE
                            last_action_was_halt = True
                            state = state.update_on_act_halt()
                        else:
                            forced_stop = False
                            break
                    else:
                        image, reward, metadata = agent.server.send_action_receive_feedback(action)

                        # Store it in the replay memory list
                        replay_item = ReplayMemoryItem(state, action, reward, mode=mode)
                        batch_replay_items.append(replay_item)

                        # Update the agent state
                        state = state.update(image, action)

                        num_actions += 1
                        total_reward += reward
                        last_action_was_halt = False
                else:
                    raise AssertionError("Mode should be either read or act. "
                                         "Unhandled mode: " + str(mode))

            assert mode == ReadPointerAgent.ACT_MODE, "Agent should end on Act Mode"

            # Send final STOP action and get feedback
            image, reward, metadata = agent.server.halt_and_receive_feedback()
            total_reward += reward

            # Store it in the replay memory list
            if not forced_stop:
                replay_item = ReplayMemoryItem(
                    state, agent.action_space.get_stop_action_index(), reward, mode)
                batch_replay_items.append(replay_item)

            # Update the scores based on meta_data
            # self.meta_data_util.log_results(metadata)

            # Perform update
            episodes_in_batch += 1
            if episodes_in_batch == 1:
                loss_val = self.do_update(batch_replay_items)
                batch_replay_items = []
                entropy_val = float(self.entropy.data[0])
                self.tensorboard.log(entropy_val, loss_val, total_reward)
                total_reward = 0
                episodes_in_batch = 0

            self.tensorboard.log_train_error(metadata["error"])

        # Save the model
        self.model.save_model(experiment_name +
                              "/read_pointer_contextual_bandit_resnet_epoch_" + str(epoch))
        logging.info("Training data action counts %r", action_counts)
def do_train_(shared_model, config, action_space, meta_data_util, constants,
              train_dataset, tune_dataset, experiment, experiment_name, rank,
              server, logger, model_type, vocab, use_pushover=False):

    print("In training...")
    launch_k_unity_builds([config["port"]], "./simulators/house_3_elmer.x86_64")
    server.initialize_server()
    print("launched builds")

    # Test policy
    test_policy = gp.get_argmax_action

    # torch.manual_seed(args.seed + rank)
    if rank == 0:  # client 0 creates a tensorboard server
        tensorboard = Tensorboard(experiment_name)
    else:
        tensorboard = None

    if use_pushover:
        # pushover_logger = PushoverLogger(experiment_name)
        pushover_logger = None
    else:
        pushover_logger = None

    # Create a local model for rollouts
    local_model = model_type(config, constants)
    # local_model.train()

    # Create the Agent
    logger.log("STARTING AGENT")
    tmp_agent = TmpHouseAgent(server=server,
                              model=local_model,
                              test_policy=test_policy,
                              action_space=action_space,
                              meta_data_util=meta_data_util,
                              config=config,
                              constants=constants)
    logger.log("Created Agent...")

    action_counts = [0] * action_space.num_actions()
    max_epochs = constants["max_epochs"]
    dataset_size = len(train_dataset)
    tune_dataset_size = len(tune_dataset)

    # Create the learner to compute the loss
    learner = TmpSupervisedLearning(shared_model, local_model, action_space,
                                    meta_data_util, config, constants, tensorboard)
    # TODO change 2 --- unity launch moved up

    for epoch in range(1, max_epochs + 1):

        for data_point_ix, data_point in enumerate(train_dataset):

            # Sync with the shared model
            # local_model.load_state_dict(shared_model.state_dict())
            local_model.load_from_state_dict(shared_model.get_state_dict())

            if (data_point_ix + 1) % 100 == 0:
                logger.log("Done %d out of %d" % (data_point_ix, dataset_size))
                logger.log("Training data action counts %r" % action_counts)

            image, metadata = tmp_agent.server.reset_receive_feedback(data_point)
            # instruction = TmpSupervisedLearning.convert_text_to_indices(metadata["instruction"], vocab)
            instruction = data_point.get_instruction()

            # Pose and Orientation gone TODO change 3
            state = AgentObservedState(instruction=instruction,
                                       config=config,
                                       constants=constants,
                                       start_image=image,
                                       previous_action=None,
                                       data_point=data_point)

            model_state = None
            batch_replay_items = []
            total_reward = 0

            # trajectory = metadata["trajectory"]
            trajectory = data_point.get_trajectory()[0:300]

            for action in trajectory:

                # Sample action using the policy
                log_probabilities, model_state, image_emb_seq, state_feature = \
                    local_model.get_probs(state, model_state)

                action_counts[action] += 1

                # Send the action and get feedback
                image, reward, metadata = tmp_agent.server.send_action_receive_feedback(action)

                # Store it in the replay memory list
                replay_item = ReplayMemoryItem(state, action, reward, log_prob=log_probabilities)
                batch_replay_items.append(replay_item)

                # Update the agent state
                # Pose and orientation gone, TODO change 4
                state = state.update(image, action, data_point=data_point)

                total_reward += reward

            # Send final STOP action and get feedback
            # Sample action using the policy
            log_probabilities, model_state, image_emb_seq, state_feature = \
                local_model.get_probs(state, model_state)
            image, reward, metadata = tmp_agent.server.halt_and_receive_feedback()
            total_reward += reward

            # if tensorboard is not None:
            #     tensorboard.log_all_train_errors(
            #         metadata["edit_dist_error"], metadata["closest_dist_error"],
            #         metadata["stop_dist_error"])

            # Store it in the replay memory list
            replay_item = ReplayMemoryItem(state, action_space.get_stop_action_index(),
                                           reward, log_prob=log_probabilities)
            batch_replay_items.append(replay_item)

            # Update the scores based on meta_data
            # self.meta_data_util.log_results(metadata)

            # Perform update
            if len(batch_replay_items) > 0:  # 32
                loss_val = learner.do_update(batch_replay_items)
                # self.action_prediction_loss_calculator.predict_action(batch_replay_items)
                # del batch_replay_items[:]  # in place list clear

                if tensorboard is not None:
                    # cross_entropy = float(learner.cross_entropy.data[0])
                    # tensorboard.log(cross_entropy, loss_val, 0)
                    num_actions = len(trajectory) + 1
                    tensorboard.log_scalar("loss_val", loss_val)  # /float(num_actions)
                    entropy = float(learner.entropy.data[0])  # /float(num_actions)
                    tensorboard.log_scalar("entropy", entropy)
                    ratio = float(learner.ratio.data[0])
                    tensorboard.log_scalar("Abs_objective_to_entropy_ratio", ratio)

                    if learner.action_prediction_loss is not None:
                        action_prediction_loss = float(learner.action_prediction_loss.data[0])
                        learner.tensorboard.log_action_prediction_loss(action_prediction_loss)
                    if learner.temporal_autoencoder_loss is not None:
                        temporal_autoencoder_loss = float(learner.temporal_autoencoder_loss.data[0])
                        tensorboard.log_temporal_autoencoder_loss(temporal_autoencoder_loss)
                    if learner.object_detection_loss is not None:
                        object_detection_loss = float(learner.object_detection_loss.data[0])
                        tensorboard.log_object_detection_loss(object_detection_loss)
                    if learner.symbolic_language_prediction_loss is not None:
                        symbolic_language_prediction_loss = float(
                            learner.symbolic_language_prediction_loss.data[0])
                        tensorboard.log_scalar("sym_language_prediction_loss",
                                               symbolic_language_prediction_loss)
                    if learner.goal_prediction_loss is not None:
                        goal_prediction_loss = float(learner.goal_prediction_loss.data[0])
                        tensorboard.log_scalar("goal_prediction_loss", goal_prediction_loss)
                    if learner.mean_factor_entropy is not None:
                        mean_factor_entropy = float(learner.mean_factor_entropy.data[0])
                        tensorboard.log_factor_entropy_loss(mean_factor_entropy)

        # Save the model
        local_model.save_model(experiment + "/contextual_bandit_" + str(rank) +
                               "_epoch_" + str(epoch))
        logger.log("Training data action counts %r" % action_counts)

        if tune_dataset_size > 0:
            # Test on tuning data
            print("Going for testing")
            tmp_agent.test(tune_dataset, vocab, tensorboard=tensorboard,
                           logger=logger, pushover_logger=pushover_logger)
            print("Done testing")
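# --- Editor's sketch (assumption): the objective behind TmpSupervisedLearning ---
# The loop above replays gold trajectory actions and stores the model's
# log-probabilities in the replay items, so the natural `do_update` is behavior
# cloning: maximize the log-likelihood of each demonstrated action. Accessor
# names on the replay items are hypothetical.
import torch

def behavior_cloning_loss(batch_replay_items):
    """Minimal sketch: mean negative log-likelihood of the demonstrated actions."""
    log_likelihoods = []
    for item in batch_replay_items:
        # item.log_prob: 1 x num_actions log-probabilities; item.action: gold action id
        log_likelihoods.append(item.log_prob[0, item.action])
    return -torch.stack(log_likelihoods).mean()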
def do_train(self, agent, train_dataset, tune_dataset, experiment_name):
    """ Perform training """

    dataset_size = len(train_dataset)

    for epoch in range(1, self.max_epoch + 1):
        logging.info("Starting epoch %d", epoch)
        action_counts = [0] * self.action_space.num_actions()

        # Test on tuning data
        agent.test(tune_dataset, tensorboard=self.tensorboard)

        batch_replay_items = []
        total_reward = 0
        episodes_in_batch = 0

        for data_point_ix, data_point in enumerate(train_dataset):
            if (data_point_ix + 1) % 100 == 0:
                logging.info("Done %d out of %d", data_point_ix, dataset_size)
                logging.info("Training data action counts %r", action_counts)

            # instruction = instruction_to_string(data_point.get_instruction(), self.config)
            # print("TRAIN INSTRUCTION: %r" % instruction)
            # print("")
            instruction = data_point.get_paragraph_instruction()

            num_actions = 0
            max_num_actions = len(data_point.get_trajectory())
            max_num_actions += self.constants["max_extra_horizon"]

            image, metadata = agent.server.reset_receive_feedback(data_point)
            pose = int(metadata["y_angle"] / 15.0)
            position_orientation = (metadata["x_pos"], metadata["z_pos"], metadata["y_angle"])
            state = AgentObservedState(
                instruction=data_point.get_paragraph_instruction(),
                config=self.config,
                constants=self.constants,
                start_image=image,
                previous_action=None,
                pose=pose,
                position_orientation=position_orientation,
                data_point=data_point)
            state.start_read_pointer, state.end_read_pointer = data_point.get_instruction_indices()

            forced_stop = True

            while num_actions < max_num_actions:
                # Sample action using the policy
                # Generate probabilities over actions
                probabilities = list(torch.exp(self.model.get_probs(state).data))

                # Use test policy to get the action
                action = gp.sample_action_from_prob(probabilities)
                action_counts[action] += 1

                if action == agent.action_space.get_stop_action_index():
                    forced_stop = False
                    break

                # Send the action and get feedback
                image, reward, metadata = agent.server.send_action_receive_feedback(action)

                # Store it in the replay memory list
                replay_item = ReplayMemoryItem(state, action, reward)
                batch_replay_items.append(replay_item)

                # Update the agent state
                pose = int(metadata["y_angle"] / 15.0)
                position_orientation = (metadata["x_pos"], metadata["z_pos"], metadata["y_angle"])
                state = state.update(
                    image, action, pose=pose,
                    position_orientation=position_orientation,
                    data_point=data_point)

                num_actions += 1
                total_reward += reward

            # Send final STOP action and get feedback
            image, reward, metadata = agent.server.halt_and_receive_feedback()
            total_reward += reward

            # Store it in the replay memory list
            if not forced_stop:
                replay_item = ReplayMemoryItem(
                    state, agent.action_space.get_stop_action_index(), reward)
                batch_replay_items.append(replay_item)

            # Update the scores based on meta_data
            # self.meta_data_util.log_results(metadata)

            # Perform update
            episodes_in_batch += 1
            if episodes_in_batch == 1:
                loss_val = self.do_update(batch_replay_items)
                batch_replay_items = []
                # entropy_val = float(self.entropy.data[0])
                # self.tensorboard.log(entropy_val, loss_val, total_reward)
                cross_entropy = float(self.cross_entropy.data[0])
                self.tensorboard.log(cross_entropy, loss_val, total_reward)
                total_reward = 0
                episodes_in_batch = 0

            if self.tensorboard is not None:
                self.tensorboard.log_all_train_errors(
                    metadata["edit_dist_error"],
                    metadata["closest_dist_error"],
                    metadata["stop_dist_error"])

        # Save the model
        self.model.save_model(experiment_name + "/contextual_bandit_resnet_epoch_" + str(epoch))
        logging.info("Training data action counts %r", action_counts)
def test_multi_step_action_types(self, test_dataset, vocab, goal_type=None,
                                 tensorboard=None, logger=None, pushover_logger=None):
    """ Perform multi-step testing: the action type model decides how many
    steps to take, and for each step the goal is (re)set and the navigation
    model is run for a budgeted number of inner-loop actions. """

    self.server.clear_metadata()
    action_counts = [0] * self.action_space.num_actions()
    task_completion_accuracy = 0

    metadata = {"feedback": ""}
    text_embedding_model = self.goal_prediction_model.text_module

    for data_point_ix, data_point in enumerate(test_dataset):
        instruction_string = " ".join([vocab[token_id] for token_id in data_point.instruction])
        self.log("Instruction is %r " % instruction_string, logger)

        # Call the action type model to determine the number of steps
        token_indices = self.action_type_model.decoding_from_indices_to_indices(
            data_point.instruction, text_embedding_model)
        print("Token indices ", token_indices)
        assert len(token_indices) <= 5

        # Call the navigation model
        image, metadata = self.server.reset_receive_feedback(data_point)
        state = AgentObservedState(instruction=data_point.instruction,
                                   config=self.config,
                                   constants=self.constants,
                                   start_image=image,
                                   previous_action=None,
                                   data_point=data_point)

        num_actions = 0
        max_num_actions = self.constants["horizon"]
        num_inner_loop_steps = int(max_num_actions / max(1, len(token_indices)))
        model_state = None

        for outer_loop_iter in range(0, len(token_indices)):

            if goal_type == "inferred":
                # Get the panorama and set tracking
                self._explore_and_set_tracking(data_point, data_point_ix, instruction_string)

            state.goal = self.get_goal(metadata, goal_type)

            for inner_loop_iter in range(0, num_inner_loop_steps):

                # Generate probabilities over actions
                if isinstance(self.navigation_model, AbstractModel):
                    probabilities = list(torch.exp(self.navigation_model.get_probs(state).data))
                elif isinstance(self.navigation_model, AbstractIncrementalModel):
                    log_probabilities, model_state, _, _ = self.navigation_model.get_probs(
                        state, model_state, volatile=True)
                    probabilities = list(torch.exp(log_probabilities.data))[0]
                else:
                    log_probabilities, model_state = self.navigation_model.get_probs(
                        state, model_state)
                    probabilities = list(torch.exp(log_probabilities.data))

                # Use test policy to get the action
                action = self.test_policy(probabilities)
                action_counts[action] += 1

                if token_indices[outer_loop_iter] == 1:
                    print("Performing interaction")
                    row, col, row_real, col_real = state.goal
                    if row is not None and col is not None:
                        act_name = "interact %r %r" % (row, col)
                        interact_action = self.action_space.get_action_index(act_name)
                        image, reward, metadata = self.server.send_action_receive_feedback(
                            interact_action)

                if action == self.action_space.get_stop_action_index() or num_actions >= max_num_actions:
                    break
                else:
                    # Send the action and get feedback
                    image, reward, metadata = self.server.send_action_receive_feedback(action)

                    # Update the agent state
                    state = state.update(image, action, data_point=data_point)
                    state.goal = self.get_goal(metadata, goal_type)
                    num_actions += 1

            if num_actions >= max_num_actions:
                break

        # Send the stop action and get feedback
        image, reward, metadata = self.server.halt_and_receive_feedback()

        if metadata["navigation-error"] <= 1.0:
            task_completion_accuracy += 1

        # Update the scores based on meta_data
        # self.meta_data_util.log_results(metadata, logger)
        self.log("Overall test results: %r " % metadata, logger)

    task_completion_accuracy = (task_completion_accuracy * 100.0) / float(max(len(test_dataset), 1))
    self.log("Overall test results:", logger)
    self.log("Testing: Final Metadata: %r" % metadata, logger)
    self.log("Testing: Action Distribution: %r" % action_counts, logger)
    self.log("Testing: Task Completion Accuracy: %r " % task_completion_accuracy, logger)
    # self.meta_data_util.log_results(metadata, logger)
    self.log("Testing data action counts %r" % action_counts, logger)

    if pushover_logger is not None:
        pushover_feedback = str(metadata["feedback"])
        pushover_logger.log(pushover_feedback)
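# --- Editor's note: the interact-action encoding used by the manual-control helpers ---
# Navigation actions occupy indices 0-3, and an interaction at cell (row, col)
# on a 32 x 32 image grid is flattened into a single action id via
# 4 + row * 32 + col (see the debug functions in this file). The helper below
# is a sketch restating that encoding together with its inverse.

def encode_interact_action(row, col, grid_size=32, num_nav_actions=4):
    return num_nav_actions + row * grid_size + col

def decode_interact_action(action, grid_size=32, num_nav_actions=4):
    cell = action - num_nav_actions
    return cell // grid_size, cell % grid_size

assert decode_interact_action(encode_interact_action(5, 7)) == (5, 7)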
def do_train_(shared_model, config, action_space, meta_data_util, constants,
              train_dataset, tune_dataset, experiment, experiment_name, rank,
              server, logger, model_type, use_pushover=False):

    server.initialize_server()

    # Test policy
    test_policy = gp.get_argmax_action

    # torch.manual_seed(args.seed + rank)
    if rank == 0:  # client 0 creates a tensorboard server
        tensorboard = Tensorboard(experiment_name)
    else:
        tensorboard = None

    if use_pushover:
        pushover_logger = PushoverLogger(experiment_name)
    else:
        pushover_logger = None

    # Create a local model for rollouts
    local_model = model_type(config, constants)
    # local_model.train()

    # Create the Agent
    logger.log("STARTING AGENT")
    agent = Agent(server=server,
                  model=local_model,
                  test_policy=test_policy,
                  action_space=action_space,
                  meta_data_util=meta_data_util,
                  config=config,
                  constants=constants)
    logger.log("Created Agent...")

    action_counts = [0] * action_space.num_actions()
    max_epochs = constants["max_epochs"]
    dataset_size = len(train_dataset)
    tune_dataset_size = len(tune_dataset)

    # Create the learner to compute the loss
    learner = AsynchronousAdvantageActorGAECritic(shared_model, local_model, action_space,
                                                  meta_data_util, config, constants, tensorboard)

    # Launch unity
    launch_k_unity_builds([config["port"]], "./simulators/NavDroneLinuxBuild.x86_64")

    for epoch in range(1, max_epochs + 1):
        learner.epoch = epoch
        task_completion_accuracy = 0
        mean_stop_dist_error = 0
        stop_dist_errors = []

        for data_point_ix, data_point in enumerate(train_dataset):

            # Sync with the shared model
            # local_model.load_state_dict(shared_model.state_dict())
            local_model.load_from_state_dict(shared_model.get_state_dict())

            if (data_point_ix + 1) % 100 == 0:
                logger.log("Done %d out of %d" % (data_point_ix, dataset_size))
                logger.log("Training data action counts %r" % action_counts)

            num_actions = 0
            max_num_actions = constants["horizon"] + constants["max_extra_horizon"]

            image, metadata = agent.server.reset_receive_feedback(data_point)
            pose = int(metadata["y_angle"] / 15.0)
            position_orientation = (metadata["x_pos"], metadata["z_pos"], metadata["y_angle"])
            state = AgentObservedState(
                instruction=data_point.instruction,
                config=config,
                constants=constants,
                start_image=image,
                previous_action=None,
                pose=pose,
                position_orientation=position_orientation,
                data_point=data_point)
            state.goal = GoalPrediction.get_goal_location(
                metadata, data_point, learner.image_height, learner.image_width)

            model_state = None
            batch_replay_items = []
            total_reward = 0
            forced_stop = True

            while num_actions < max_num_actions:

                # Sample action using the policy
                log_probabilities, model_state, image_emb_seq, volatile = \
                    local_model.get_probs(state, model_state)
                probabilities = list(torch.exp(log_probabilities.data))[0]

                # Sample action from the probability
                action = gp.sample_action_from_prob(probabilities)
                action_counts[action] += 1

                # Generate goal
                if config["do_goal_prediction"]:
                    goal = learner.goal_prediction_calculator.get_goal_location(
                        metadata, data_point, learner.image_height, learner.image_width)
                else:
                    goal = None

                if action == action_space.get_stop_action_index():
                    forced_stop = False
                    break

                # Send the action and get feedback
                image, reward, metadata = agent.server.send_action_receive_feedback(action)

                # Store it in the replay memory list
                replay_item = ReplayMemoryItem(state, action, reward,
                                               log_prob=log_probabilities,
                                               volatile=volatile, goal=goal)
                batch_replay_items.append(replay_item)

                # Update the agent state
                pose = int(metadata["y_angle"] / 15.0)
                position_orientation = (metadata["x_pos"], metadata["z_pos"], metadata["y_angle"])
                state = state.update(
                    image, action, pose=pose,
                    position_orientation=position_orientation,
                    data_point=data_point)
                state.goal = GoalPrediction.get_goal_location(
                    metadata, data_point, learner.image_height, learner.image_width)

                num_actions += 1
                total_reward += reward

            # Send final STOP action and get feedback
            image, reward, metadata = agent.server.halt_and_receive_feedback()
            total_reward += reward

            if metadata["stop_dist_error"] < 5.0:
                task_completion_accuracy += 1
            mean_stop_dist_error += metadata["stop_dist_error"]
            stop_dist_errors.append(metadata["stop_dist_error"])

            if tensorboard is not None:
                tensorboard.log_all_train_errors(
                    metadata["edit_dist_error"],
                    metadata["closest_dist_error"],
                    metadata["stop_dist_error"])

            # Store it in the replay memory list
            if not forced_stop:
                replay_item = ReplayMemoryItem(state, action_space.get_stop_action_index(),
                                               reward, log_prob=log_probabilities,
                                               volatile=volatile, goal=goal)
                batch_replay_items.append(replay_item)

            # Update the scores based on meta_data
            # self.meta_data_util.log_results(metadata)

            # Perform update
            if len(batch_replay_items) > 0:  # 32
                loss_val = learner.do_update(batch_replay_items)
                # self.action_prediction_loss_calculator.predict_action(batch_replay_items)
                # del batch_replay_items[:]  # in place list clear

                if tensorboard is not None:
                    cross_entropy = float(learner.cross_entropy.data[0])
                    tensorboard.log(cross_entropy, loss_val, 0)
                    entropy = float(learner.entropy.data[0]) / float(num_actions + 1)
                    v_value_loss_per_step = float(learner.value_loss.data[0]) / float(num_actions + 1)
                    tensorboard.log_scalar("entropy", entropy)
                    tensorboard.log_scalar("total_reward", total_reward)
                    tensorboard.log_scalar("v_value_loss_per_step", v_value_loss_per_step)
                    ratio = float(learner.ratio.data[0])
                    tensorboard.log_scalar("Abs_objective_to_entropy_ratio", ratio)

                    if learner.action_prediction_loss is not None:
                        action_prediction_loss = float(learner.action_prediction_loss.data[0])
                        learner.tensorboard.log_action_prediction_loss(action_prediction_loss)
                    if learner.temporal_autoencoder_loss is not None:
                        temporal_autoencoder_loss = float(learner.temporal_autoencoder_loss.data[0])
                        tensorboard.log_temporal_autoencoder_loss(temporal_autoencoder_loss)
                    if learner.object_detection_loss is not None:
                        object_detection_loss = float(learner.object_detection_loss.data[0])
                        tensorboard.log_object_detection_loss(object_detection_loss)
                    if learner.symbolic_language_prediction_loss is not None:
                        symbolic_language_prediction_loss = float(
                            learner.symbolic_language_prediction_loss.data[0])
                        tensorboard.log_scalar("sym_language_prediction_loss",
                                               symbolic_language_prediction_loss)
                    if learner.goal_prediction_loss is not None:
                        goal_prediction_loss = float(learner.goal_prediction_loss.data[0])
                        tensorboard.log_scalar("goal_prediction_loss", goal_prediction_loss)

        # Save the model
        local_model.save_model(experiment + "/contextual_bandit_" + str(rank) +
                               "_epoch_" + str(epoch))
        logger.log("Training data action counts %r" % action_counts)

        mean_stop_dist_error = mean_stop_dist_error / float(len(train_dataset))
        task_completion_accuracy = (task_completion_accuracy * 100.0) / float(len(train_dataset))
        logger.log("Training: Mean stop distance error %r" % mean_stop_dist_error)
        logger.log("Training: Task completion accuracy %r " % task_completion_accuracy)

        bins = range(0, 80, 3)  # range of distance
        histogram, _ = np.histogram(stop_dist_errors, bins)
        logger.log("Histogram of train errors %r " % histogram)

        if tune_dataset_size > 0:
            # Test on tuning data
            agent.test(tune_dataset, tensorboard=tensorboard,
                       logger=logger, pushover_logger=pushover_logger)
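# --- Editor's sketch (assumption): the GAE advantage this actor-critic learner likely uses ---
# `AsynchronousAdvantageActorGAECritic.do_update` is not shown here. Generalized
# Advantage Estimation computes A_t = sum_k (gamma * lam)^k * delta_{t+k} with
# delta_t = r_t + gamma * V(s_{t+1}) - V(s_t). The gamma/lam values below are
# hypothetical defaults, not taken from the constants file.

def gae_advantages(rewards, values, gamma=0.99, lam=0.95):
    """rewards: list of r_t; values: list of V(s_t) with a trailing bootstrap V(s_T)."""
    advantages, gae = [], 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        gae = delta + gamma * lam * gae
        advantages.append(gae)
    return list(reversed(advantages))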
def debug_tracking(self, data_point, vocab):

    self.server.clear_metadata()
    task_completion_accuracy = 0

    image, metadata = self.server.reset_receive_feedback(data_point)
    state = AgentObservedState(instruction=data_point.instruction,
                               config=self.config,
                               constants=self.constants,
                               start_image=image,
                               previous_action=None,
                               data_point=data_point)
    num_actions = 0
    print("Instruction is ", " ".join([vocab[index] for index in data_point.instruction]))
    plt.ion()

    # Get the panoramic image
    panorama, _ = self.server.explore()

    # Show the goal location
    self.show_goal_location(panorama, metadata, size=6)

    tracking_values = input("Enter the region, row and column for tracking.")
    region_ix, row, col = [int(w) for w in tracking_values.split()]

    if region_ix == 0:
        camera_ix = 3
    elif region_ix == 1:
        camera_ix = 4
    elif region_ix == 2:
        camera_ix = 5
    elif region_ix == 3:
        camera_ix = 0
    elif region_ix == 4:
        camera_ix = 1
    elif region_ix == 5:
        camera_ix = 2
    else:
        raise AssertionError("Region ix should be in {0, 1, 2, 3, 4, 5}")

    row_value = row / 32.0
    col_value = col / 32.0
    self.server.set_tracking(camera_ix, row_value, col_value)
    input("Tracking done. Enter to continue")

    while True:
        # Show the goal location
        self.show_goal_location(image, metadata, goal_type="inferred", size=1)

        incorrect_action = True
        action_string = None
        while incorrect_action:
            action_string = input(
                "Take the action. 0: Forward, 1: Left, 2: Right, 3: Stop, 4: Interact\n")
            if action_string in ['0', '1', '2', '3', '4']:
                incorrect_action = False

        if action_string == '4':
            interact_values = input("Enter the row and column in format: row col")
            row, col = interact_values.split()
            row, col = int(row), int(col)
            action_string = 4 + row * 32 + col

        action = int(action_string)
        action_name = self.action_space.get_action_name(action)

        if action == self.action_space.get_stop_action_index():
            # Send the action and get feedback
            image, reward, metadata = self.server.halt_and_receive_feedback()
            print("Metadata is ", metadata)
            if metadata["navigation-error"] <= 1.0:
                task_completion_accuracy += 1
            break
        else:
            # Send the action and get feedback
            image, reward, metadata = self.server.send_action_receive_feedback(action)

            # Update the agent state
            state = state.update(image, action, data_point=data_point)
            num_actions += 1
            print("Metadata is ", metadata)
            print("Took action %r, Got reward %r" % (action_name, reward))
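# --- Editor's note: the region-to-camera mapping above in closed form ---
# The if/elif chain maps panorama region 0..5 to camera index 3, 4, 5, 0, 1, 2,
# i.e. a rotation by three positions. A compact equivalent (sketch):

def region_to_camera_ix(region_ix):
    assert region_ix in range(6), "Region ix should be in {0, 1, 2, 3, 4, 5}"
    return (region_ix + 3) % 6

assert [region_to_camera_ix(r) for r in range(6)] == [3, 4, 5, 0, 1, 2]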
def _test(self, data_point, tensorboard=None):

    image, metadata = self.server.reset_receive_feedback(data_point)
    pose = int(metadata["y_angle"] / 15.0)
    position_orientation = (metadata["x_pos"], metadata["z_pos"], metadata["y_angle"])
    state = AgentObservedState(
        instruction=data_point.instruction,
        config=self.config,
        constants=self.constants,
        start_image=image,
        previous_action=None,
        pose=pose,
        position_orientation=position_orientation,
        data_point=data_point,
        prev_instruction=data_point.get_prev_instruction(),
        next_instruction=data_point.get_next_instruction())

    ##################################
    state.goal = GoalPrediction.get_goal_location(metadata, data_point, 32, 32)
    ##################################

    # state.start_read_pointer, state.end_read_pointer = data_point.get_instruction_indices()
    num_actions = 0
    max_num_actions = self.constants["horizon"]
    model_state = None
    actions = []

    ###################################
    # distance, angle = self.get_angle_distance(metadata, data_point)
    ###################################

    while True:
        # Generate probabilities over actions
        if isinstance(self.model, AbstractModel):
            probabilities = list(torch.exp(self.model.get_probs(state).data))
        elif isinstance(self.model, AbstractIncrementalModel):
            log_probabilities, model_state, _, _ = self.model.get_probs(
                state, model_state, volatile=True)
            probabilities = list(torch.exp(log_probabilities.data))[0]
        else:
            raise AssertionError("Unhandled Model type.")

        # Use test policy to get the action
        action = self.test_policy(probabilities)
        actions.append(action)

        if action == self.action_space.get_stop_action_index() or num_actions >= max_num_actions:
            # Send the action and get feedback
            image, reward, metadata = self.server.halt_and_receive_feedback()
            if tensorboard is not None:
                tensorboard.log_all_test_errors(
                    metadata["edit_dist_error"],
                    metadata["closest_dist_error"],
                    metadata["stop_dist_error"])

            # Update the scores based on meta_data
            self.meta_data_util.log_results(metadata)
            break
        else:
            # Send the action and get feedback
            image, reward, metadata = self.server.send_action_receive_feedback(action)

            # Update the agent state
            pose = int(metadata["y_angle"] / 15.0)
            position_orientation = (metadata["x_pos"], metadata["z_pos"], metadata["y_angle"])
            state = state.update(image, action, pose=pose,
                                 position_orientation=position_orientation,
                                 data_point=data_point)

            ##################################
            state.goal = GoalPrediction.get_goal_location(metadata, data_point, 32, 32)
            ##################################
            num_actions += 1

    # logging.info("Error, Start-Distance, Turn-Angle, %r %r %r",
    #              metadata["stop_dist_error"], distance, angle)
    return metadata, actions
def do_train_forced_reading(self, agent, train_dataset, tune_dataset, experiment_name):
    """ Perform training """

    assert isinstance(agent, ReadPointerAgent), \
        "This learning algorithm works only with ReadPointerAgent"

    dataset_size = len(train_dataset)

    for epoch in range(1, self.max_epoch + 1):
        logging.info("Starting epoch %d", epoch)
        action_counts = dict()
        action_counts[ReadPointerAgent.READ_MODE] = [0] * 2
        action_counts[ReadPointerAgent.ACT_MODE] = [0] * self.action_space.num_actions()

        # Test on tuning data
        agent.test_forced_reading(tune_dataset, tensorboard=self.tensorboard)

        batch_replay_items = []
        total_reward = 0
        episodes_in_batch = 0

        for data_point_ix, data_point in enumerate(train_dataset):
            if (data_point_ix + 1) % 100 == 0:
                logging.info("Done %d out of %d", data_point_ix, dataset_size)
                logging.info("Training data action counts %r", action_counts)

            num_actions = 0
            max_num_actions = len(data_point.get_trajectory())
            max_num_actions += self.constants["max_extra_horizon"]

            image, metadata = agent.server.reset_receive_feedback(data_point)
            oracle_segments = data_point.get_instruction_oracle_segmented()

            pose = int(metadata["y_angle"] / 15.0)
            state = AgentObservedState(instruction=data_point.instruction,
                                       config=self.config,
                                       constants=self.constants,
                                       start_image=image,
                                       previous_action=None,
                                       pose=pose)

            per_segment_budget = int(max_num_actions / len(oracle_segments))
            num_segment_actions = 0

            mode = ReadPointerAgent.READ_MODE
            current_segment_ix = 0

            while True:
                if mode == ReadPointerAgent.READ_MODE:
                    # Find the number of tokens to read for the gold segment
                    num_segment_size = len(oracle_segments[current_segment_ix])
                    current_segment_ix += 1
                    for i in range(0, num_segment_size):
                        state = state.update_on_read()
                    mode = ReadPointerAgent.ACT_MODE

                elif mode == ReadPointerAgent.ACT_MODE:
                    # Sample action using the policy
                    # Generate probabilities over actions
                    probabilities = list(torch.exp(self.model.get_probs(state, mode).data))

                    # Use test policy to get the action
                    action = gp.sample_action_from_prob(probabilities)
                    action_counts[mode][action] += 1

                    # Deal with act mode boundary conditions
                    if num_actions >= max_num_actions:
                        forced_stop = True
                        break
                    elif action == agent.action_space.get_stop_action_index() \
                            or num_segment_actions > per_segment_budget:
                        if state.are_tokens_left_to_be_read():
                            # reward = self._calc_reward_act_halt(state)
                            if metadata["error"] < 5.0:
                                reward = 1.0
                            else:
                                reward = -1.0

                            # Add to replay memory
                            replay_item = ReplayMemoryItem(
                                state, agent.action_space.get_stop_action_index(), reward, mode)
                            if action == agent.action_space.get_stop_action_index():
                                batch_replay_items.append(replay_item)

                            mode = ReadPointerAgent.READ_MODE
                            agent.server.force_goal_update()
                            state = state.update_on_act_halt()
                            num_segment_actions = 0
                        else:
                            if action == agent.action_space.get_stop_action_index():
                                forced_stop = False
                            else:
                                # Stopping due to per-segment budget exhaustion
                                forced_stop = True
                            break
                    else:
                        image, reward, metadata = agent.server.send_action_receive_feedback(action)

                        # Store it in the replay memory list
                        replay_item = ReplayMemoryItem(state, action, reward, mode=mode)
                        batch_replay_items.append(replay_item)

                        # Update the agent state
                        pose = int(metadata["y_angle"] / 15.0)
                        state = state.update(image, action, pose=pose)

                        num_actions += 1
                        num_segment_actions += 1
                        total_reward += reward
                else:
                    raise AssertionError("Mode should be either read or act. "
                                         "Unhandled mode: " + str(mode))

            assert mode == ReadPointerAgent.ACT_MODE, "Agent should end on Act Mode"

            # Send final STOP action and get feedback
            image, reward, metadata = agent.server.halt_and_receive_feedback()
            total_reward += reward

            # Store it in the replay memory list
            if not forced_stop:
                replay_item = ReplayMemoryItem(
                    state, agent.action_space.get_stop_action_index(), reward, mode)
                batch_replay_items.append(replay_item)

            # Update the scores based on meta_data
            # self.meta_data_util.log_results(metadata)

            # Perform update
            episodes_in_batch += 1
            if episodes_in_batch == 1:
                loss_val = self.do_update(batch_replay_items)
                batch_replay_items = []
                entropy_val = float(self.entropy.data[0])
                self.tensorboard.log(entropy_val, loss_val, total_reward)
                total_reward = 0
                episodes_in_batch = 0

            self.tensorboard.log_train_error(metadata["error"])

        # Save the model
        self.model.save_model(experiment_name +
                              "/read_pointer_forced_reading_contextual_bandit_resnet_epoch_" +
                              str(epoch))
        logging.info("Training data action counts %r", action_counts)
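# --- Editor's note: worked example of the per-segment action budget above ---
# With a gold trajectory of 40 actions, max_extra_horizon = 20, and 3 oracle
# segments, each segment gets int(60 / 3) = 20 actions before a halt is forced.
# The values here are illustrative, not taken from the experiment configs.

def per_segment_budget_example():
    max_num_actions = 40 + 20   # trajectory length + max_extra_horizon
    num_segments = 3            # oracle-segmented instruction
    return int(max_num_actions / num_segments)

assert per_segment_budget_example() == 20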
def test(self, test_dataset, vocab, tensorboard=None, logger=None, pushover_logger=None):

    self.server.clear_metadata()
    action_counts = [0] * self.action_space.num_actions()
    task_completion_accuracy = 0

    metadata = {"feedback": ""}
    for data_point_ix, data_point in enumerate(test_dataset):
        image, metadata = self.server.reset_receive_feedback(data_point)
        state = AgentObservedState(instruction=data_point.instruction,
                                   config=self.config,
                                   constants=self.constants,
                                   start_image=image,
                                   previous_action=None,
                                   data_point=data_point)
        state.goal = self.get_goal(metadata)

        # state.start_read_pointer, state.end_read_pointer = data_point.get_instruction_indices()
        num_actions = 0
        max_num_actions = self.constants["horizon"]
        model_state = None

        while True:
            # Generate probabilities over actions
            if isinstance(self.model, AbstractModel):
                probabilities = list(torch.exp(self.model.get_probs(state).data))
            elif isinstance(self.model, AbstractIncrementalModel):
                log_probabilities, model_state, _, _ = self.model.get_probs(
                    state, model_state, volatile=True)
                probabilities = list(torch.exp(log_probabilities.data))[0]
            else:
                log_probabilities, model_state = self.model.get_probs(state, model_state)
                probabilities = list(torch.exp(log_probabilities.data))
                # raise AssertionError("Unhandled Model type.")

            # Use test policy to get the action
            action = self.test_policy(probabilities)
            action_counts[action] += 1

            if action == self.action_space.get_stop_action_index() or num_actions >= max_num_actions:
                # Send the action and get feedback
                image, reward, metadata = self.server.halt_and_receive_feedback()
                # if tensorboard is not None:
                #     tensorboard.log_all_test_errors(
                #         metadata["edit_dist_error"],
                #         metadata["closest_dist_error"],
                #         metadata["stop_dist_error"])
                # self.log("Testing: Taking stop action and got reward %r " % reward, logger)

                if metadata["navigation-error"] <= 1.0:
                    task_completion_accuracy += 1

                # Update the scores based on meta_data
                # self.meta_data_util.log_results(metadata, logger)
                # self.log("Overall test results: %r " % metadata, logger)
                break
            else:
                # Send the action and get feedback
                image, reward, metadata = self.server.send_action_receive_feedback(action)
                # self.log("Testing: Taking action %r and got reward %r " % (action, reward), logger)
                # time.sleep(0.5)

                # Update the agent state
                state = state.update(image, action, data_point=data_point)
                state.goal = self.get_goal(metadata)
                num_actions += 1

    task_completion_accuracy = (task_completion_accuracy * 100.0) / float(max(len(test_dataset), 1))
    self.log("Overall test results:", logger)
    self.log("Testing: Final Metadata: %r" % metadata, logger)
    self.log("Testing: Action Distribution: %r" % action_counts, logger)
    self.log("Testing: Task Completion Accuracy: %r " % task_completion_accuracy, logger)
    # self.meta_data_util.log_results(metadata, logger)
    self.log("Testing data action counts %r" % action_counts, logger)

    if pushover_logger is not None:
        pushover_feedback = str(metadata["feedback"])
        pushover_logger.log(pushover_feedback)
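# --- Editor's sketch (assumption): the greedy test policy used at evaluation ---
# Throughout this file `test_policy` is bound to gp.get_argmax_action, so at
# test time the agent acts greedily rather than sampling as in training. A
# minimal equivalent over a plain probability list:

def get_argmax_action_sketch(probabilities):
    probabilities = list(probabilities)
    return probabilities.index(max(probabilities))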
def test_auto_segmented(self, test_dataset, logger=None, tensorboard=None,
                        segmenting_type="oracle"):

    assert segmenting_type in ("auto", "oracle")
    self.server.clear_metadata()
    action_counts = [0] * self.action_space.num_actions()
    self.log("Performing testing on paragraphs with segmenting type %r" % segmenting_type, logger)

    metadata = {"feedback": ""}
    for data_point in test_dataset:
        if segmenting_type == "auto":
            segmented_instruction = data_point.get_instruction_auto_segmented()
        else:
            segmented_instruction = data_point.get_instruction_oracle_segmented()

        max_num_actions = self.constants["horizon"]
        image, metadata = self.server.reset_receive_feedback(data_point)

        for instruction_i, instruction in enumerate(segmented_instruction):
            pose = int(metadata["y_angle"] / 15.0)
            position_orientation = (metadata["x_pos"], metadata["z_pos"], metadata["y_angle"])
            state = AgentObservedState(
                instruction=instruction,
                config=self.config,
                constants=self.constants,
                start_image=image,
                previous_action=None,
                pose=pose,
                position_orientation=position_orientation,
                data_point=data_point,
                prev_instruction=data_point.get_prev_instruction(),
                next_instruction=data_point.get_next_instruction())

            # Reset the actions taken and model state
            num_actions = 0
            model_state = None

            # Predict the goal by taking an exploration image and then finding
            # the next suitable place to visit
            exploration_image, _, _ = self.server.explore()
            image_slices = []
            for img_ctr in range(0, 6):
                image_slice = exploration_image[img_ctr * 3:(img_ctr + 1) * 3, :, :]  # 3 x height x width
                # Scale the intensity of the image as done by scipy.misc.imsave
                image_slice = scipy.misc.bytescale(image_slice.swapaxes(0, 1).swapaxes(1, 2))
                image_slices.append(image_slice)

            # Reorder and horizontally stitch the images
            reordered_images = [image_slices[3], image_slices[4], image_slices[5],
                                image_slices[0], image_slices[1], image_slices[2]]
            exploration_image = np.hstack(reordered_images).swapaxes(1, 2).swapaxes(0, 1)  # 3 x height x (width*6)

            start_pos = (metadata["x_pos"], metadata["z_pos"], metadata["y_angle"])
            goal_pos = data_point.get_destination_list()[instruction_i]
            predicted_goal, predictor_error = self.get_3d_location_for_paragraphs(
                exploration_image, instruction, start_pos, goal_pos, panaroma=True)

            current_bot_location = metadata["x_pos"], metadata["z_pos"]
            current_bot_pose = metadata["y_angle"]
            state.goal = PredictorPlannerAgent.get_goal_location(
                current_bot_location, current_bot_pose, predicted_goal, 32, 32)
            print("Predicted Error ", predictor_error)

            while True:
                # Generate probabilities over actions
                if isinstance(self.model, AbstractModel):
                    probabilities = list(torch.exp(self.model.get_probs(state).data))
                elif isinstance(self.model, AbstractIncrementalModel):
                    log_probabilities, model_state, _, _ = self.model.get_probs(
                        state, model_state, volatile=True)
                    probabilities = list(torch.exp(log_probabilities.data))[0]
                else:
                    raise AssertionError("Unhandled Model type.")

                # Use test policy to get the action
                action = self.test_policy(probabilities)
                action_counts[action] += 1

                if action == self.action_space.get_stop_action_index() or num_actions >= max_num_actions:
                    intermediate_goal = data_point.get_destination_list()[instruction_i]
                    agent_position = metadata["x_pos"], metadata["z_pos"]
                    distance = self._l2_distance(agent_position, intermediate_goal)
                    self.log("Instruction is %r " % instruction, logger)
                    self.log("Predicted Goal is %r, Goal Reached is %r and Real goal is %r " %
                             (predicted_goal, agent_position, intermediate_goal), logger)
                    self.log("Agent: Position %r got Distance %r " %
                             (instruction_i + 1, distance), logger)
                    break
                else:
                    # Send the action and get feedback
                    image, reward, metadata = self.server.send_action_receive_feedback(action)

                    # Update the agent state
                    pose = int(metadata["y_angle"] / 15.0)
                    position_orientation = (metadata["x_pos"], metadata["z_pos"], metadata["y_angle"])
                    state = state.update(
                        image, action, pose=pose,
                        position_orientation=position_orientation,
                        data_point=data_point)

                    # Set the goal based on the current position and angle
                    current_bot_location = metadata["x_pos"], metadata["z_pos"]
                    current_bot_pose = metadata["y_angle"]
                    state.goal = PredictorPlannerAgent.get_goal_location(
                        current_bot_location, current_bot_pose, predicted_goal, 32, 32)
                    num_actions += 1

        image, reward, metadata = self.server.halt_and_receive_feedback()
        if tensorboard is not None:
            tensorboard.log_all_test_errors(
                metadata["edit_dist_error"],
                metadata["closest_dist_error"],
                metadata["stop_dist_error"])

        # Update the scores based on meta_data
        self.meta_data_util.log_results(metadata)

    self.meta_data_util.log_results(metadata)
    logging.info("Testing data action counts %r", action_counts)
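# --- Editor's sketch: the panorama stitching above, isolated and runnable ---
# The exploration image arrives as 6 stacked camera views in channel blocks of
# 3 (an 18 x H x W array). The slices are reordered so camera 3 comes first and
# then horizontally stitched back into a single 3 x H x (6*W) array. The shapes
# below are illustrative; the intensity scaling step is omitted.
import numpy as np

def stitch_panorama(exploration_image):
    """exploration_image: (18, H, W) array holding 6 RGB camera slices."""
    slices = [exploration_image[i * 3:(i + 1) * 3].transpose(1, 2, 0)  # H x W x 3
              for i in range(6)]
    reordered = [slices[3], slices[4], slices[5], slices[0], slices[1], slices[2]]
    return np.hstack(reordered).transpose(2, 0, 1)  # 3 x H x (6*W)

assert stitch_panorama(np.zeros((18, 4, 5))).shape == (3, 4, 30)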
def test_auto_segmented(self, test_dataset, tensorboard=None, segmenting_type="auto"):

    assert segmenting_type in ("auto", "oracle")
    self.server.clear_metadata()
    action_counts = [0] * self.action_space.num_actions()

    metadata = ""
    for data_point in test_dataset:
        if segmenting_type == "auto":
            segmented_instruction = data_point.get_instruction_auto_segmented()
        else:
            segmented_instruction = data_point.get_instruction_oracle_segmented()

        num_segments = len(segmented_instruction)
        gold_num_actions = len(data_point.get_trajectory())
        horizon = gold_num_actions // num_segments
        horizon += self.constants["max_extra_horizon_auto_segmented"]

        image, metadata = self.server.reset_receive_feedback(data_point)

        instruction = instruction_to_string(data_point.get_instruction(), self.config)
        print("TEST INSTRUCTION: %r" % instruction)
        print("")

        for instruction_i, instruction in enumerate(segmented_instruction):
            state = AgentObservedState(
                instruction=instruction,
                config=self.config,
                constants=self.constants,
                start_image=image,
                previous_action=None,
                prev_instruction=data_point.get_prev_instruction(),
                next_instruction=data_point.get_next_instruction())

            num_actions = 0
            # self._save_agent_state(state, num_actions)

            while True:
                # Generate probabilities over actions
                probabilities = list(torch.exp(self.model.get_probs(state).data))
                # print("test probs:", probabilities)

                # Use test policy to get the action
                action = self.test_policy(probabilities)
                action_counts[action] += 1
                # logging.info("Taking action-num=%d horizon=%d action=%s from %s",
                #              num_actions, horizon, str(action), str(probabilities))

                if action == self.action_space.get_stop_action_index() or num_actions >= horizon:
                    break
                else:
                    # Send the action and get feedback
                    image, reward, metadata = self.server.send_action_receive_feedback(action)

                    # Update the agent state
                    state = state.update(image, action)
                    num_actions += 1

        _, _, metadata = self.server.halt_and_receive_feedback()
        if tensorboard is not None:
            tensorboard.log_test_error(metadata["error"])
        self.meta_data_util.log_results(metadata)

    logging.info("Testing data action counts %r", action_counts)
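# --- Editor's note: worked example of the per-segment horizon above ---
# With a 36-action gold trajectory split into 4 segments and
# max_extra_horizon_auto_segmented = 10, each segment is allowed
# 36 // 4 + 10 = 19 actions before it is cut off. (Illustrative values only.)

def per_segment_horizon_example():
    gold_num_actions = 36
    num_segments = 4
    max_extra_horizon_auto_segmented = 10
    return gold_num_actions // num_segments + max_extra_horizon_auto_segmented

assert per_segment_horizon_example() == 19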