Example #1
0
 def __init__(self,
              run_name="",
              ang_weight=0.33,
              fwd_weight=0.33,
              stop_weight=0.33):
     super(CrossEntropy2d, self).__init__()
     self.softmax = SpatialSoftmax2d()
     self.logsoftmax = SpatialSoftmax2d(log=True)
    def __init__(self, run_name="", model_instance_name="only"):

        super(PVN_Stage2_Keyboard, self).__init__()
        self.model_name = "pvn_stage2"
        self.run_name = run_name
        self.instance_name = model_instance_name
        self.writer = LoggingSummaryWriter(log_dir=f"{get_logging_dir()}/runs/{run_name}/{self.instance_name}")

        self.params_s1 = get_current_parameters()["ModelPVN"]["Stage1"]
        self.params = get_current_parameters()["ModelPVN"]["Stage2"]

        self.prof = SimpleProfiler(torch_sync=PROFILE, print=PROFILE)
        self.iter = nn.Parameter(torch.zeros(1), requires_grad=False)

        self.spatialsoftmax = SpatialSoftmax2d()
        self.action_loss = ActionLoss()

        self.teleoper = KeyTeleop()

        self.env_id = None
        self.seg_idx = None
        self.prev_instruction = None
        self.seq_step = 0
        self.get_act_start_pose = None
        self.gt_labels = None
Example #3
0
    def __init__(self, run_name=""):

        super(ModelTrajectoryToAction, self).__init__()
        self.model_name = "lsvd_action"
        self.run_name = run_name
        self.writer = LoggingSummaryWriter(log_dir="runs/" + run_name)

        self.params = get_current_parameters()["ModelPVN"]
        self.aux_weights = get_current_parameters()["AuxWeights"]

        self.prof = SimpleProfiler(torch_sync=PROFILE, print=PROFILE)
        self.iter = nn.Parameter(torch.zeros(1), requires_grad=False)

        # Common
        # --------------------------------------------------------------------------------------------------------------
        self.map_transform_w_to_s = MapTransformerBase(
            source_map_size=self.params["global_map_size"],
            dest_map_size=self.params["local_map_size"],
            world_size=self.params["world_size_px"])

        self.map_transform_r_to_w = MapTransformerBase(
            source_map_size=self.params["local_map_size"],
            dest_map_size=self.params["global_map_size"],
            world_size=self.params["world_size_px"])

        # Output an action given the global semantic map
        if self.params["map_to_action"] == "downsample2":
            self.map_to_action = EgoMapToActionTriplet(
                map_channels=self.params["map_to_act_channels"],
                map_size=self.params["local_map_size"],
                other_features_size=self.params["emb_size"])

        elif self.params["map_to_action"] == "cropped":
            self.map_to_action = CroppedMapToActionTriplet(
                map_channels=self.params["map_to_act_channels"],
                map_size=self.params["local_map_size"],
                manual=self.params["manual_rule"],
                path_only=self.params["action_in_path_only"],
                recurrence=self.params["action_recurrence"])

        self.spatialsoftmax = SpatialSoftmax2d()
        self.gt_fill_missing = MapBatchFillMissing(
            self.params["local_map_size"], self.params["world_size_px"])

        # Don't freeze the trajectory to action weights, because it will be pre-trained during path-prediction training
        # and finetuned on all timesteps end-to-end
        enable_weight_saving(self.map_to_action,
                             "map_to_action",
                             alwaysfreeze=False,
                             neverfreeze=True)

        self.action_loss = ActionLoss()

        self.env_id = None
        self.seg_idx = None
        self.prev_instruction = None
        self.seq_step = 0
        self.get_act_start_pose = None
        self.gt_labels = None
Example #4
0
    def __init__(self,
                 lingunet_params,
                 prior_channels_in,
                 posterior_channels_in,
                 dual_head=False,
                 compute_prior=True,
                 use_prior=False,
                 oob=False):
        super(RatioPathPredictor, self).__init__()

        self.use_prior = use_prior
        self.prior_img_channels = prior_channels_in
        self.posterior_img_channels = posterior_channels_in
        self.dual_head = dual_head
        self.small_network = lingunet_params.get("small_network")
        self.oob = oob

        if use_prior:
            assert compute_prior, "If we want to use the prior distribution, we should compute it, right?"

        if self.oob:
            lingunet_params["in_channels"] = posterior_channels_in
            self.unet_posterior = Lingunet5OOB(deepcopy(lingunet_params))
            lingunet_params["in_channels"] = prior_channels_in
            self.unet_prior = Lingunet5OOB(deepcopy(lingunet_params))
        elif self.small_network:
            lingunet_params["in_channels"] = posterior_channels_in
            self.unet_posterior = Lingunet5S(deepcopy(lingunet_params))
            lingunet_params["in_channels"] = prior_channels_in
            self.unet_prior = Lingunet5S(deepcopy(lingunet_params))
        elif self.dual_head:
            lingunet_params["in_channels"] = posterior_channels_in
            self.unet_posterior = Lingunet5DualHead(deepcopy(lingunet_params))
            lingunet_params["in_channels"] = prior_channels_in
            self.unet_prior = Lingunet5DualHead(deepcopy(lingunet_params))
        else:
            lingunet_params["in_channels"] = posterior_channels_in
            self.unet_posterior = Lingunet5(deepcopy(lingunet_params))
            lingunet_params["in_channels"] = prior_channels_in
            self.unet_prior = Lingunet5(deepcopy(lingunet_params))

        self.softmax = SpatialSoftmax2d()
        self.norm = nn.InstanceNorm2d(2)
        self.compute_prior = compute_prior

        self.dbg_t = None
Example #5
0
    def __init__(self, prior_channels=32,
                 posterior_channels=32,
                 pred_channels=2,
                 emb_size=120,
                 source_map_size=32,
                 world_size=32,
                 compute_prior=True,
                 use_prior=False,
                 l2=False):
        super(RatioPathPredictor, self).__init__(source_map_size, world_size)

        self.prior_img_channels = prior_channels
        self.posterior_img_channels = posterior_channels
        self.emb_size = emb_size
        self.l2 = l2
        self.use_prior = use_prior

        if use_prior:
            assert compute_prior, "If we want to use the prior distribution, we should compute it, right?"

        self.unet_posterior = Unet5ContextualBneck(
            posterior_channels,
            pred_channels,
            emb_size,
            hc1=48, hb1=24, hc2=128)

        self.unet_prior = Unet5ContextualBneck(
            prior_channels,
            pred_channels,
            1,
            hc1=48, hb1=24, hc2=128)

        self.softmax = SpatialSoftmax2d()
        self.norm = nn.InstanceNorm2d(2)
        self.compute_prior = compute_prior

        #self.map_filter = MapLangSemanticFilter(emb_size, feature_channels, 3)
        self.map_size = source_map_size
        self.world_size = world_size

        self.dbg_t = None
        self.seq = 0
Example #6
0
    def __init__(self,
                 run_name,
                 ignore_lang=False,
                 class_loss=True,
                 ground_loss=True):
        super(ModelTopDownPathGoalPredictor, self).__init__()
        self.run_name = run_name
        self.model_name = "top_down_path_pred_pretrain"
        self.writer = SummaryWriter(log_dir="runs/" + run_name)

        self.ignore_lang = ignore_lang
        self.class_loss = class_loss
        self.ground_loss = ground_loss

        # The feature net extracts the 2D feature map from the input image.
        # The label_pool down-sizes the ground-truth labels, which are input at the same size as the input image
        # The output predicted labels are the size of the feature map
        self.feature_net = ResNet13Light(32, down_pad=True)
        self.label_pool = nn.MaxPool2d(8)

        if self.ground_loss:
            self.lang_filter = MapLangSemanticFilter(sentence_embedding_size,
                                                     32, 3)
            self.aux_ground_linear = nn.Linear(3, 2)
            enable_weight_saving(self.lang_filter, "ground_filter")
            enable_weight_saving(self.aux_ground_linear, "ground_aux_linear")

        if RESNET:
            self.unet = ResNetConditional(sentence_embedding_size, 35, 2)
        else:
            unet_c_in = 35 if self.ground_loss else 32
            unet_hc1 = 48 if self.ground_loss else 48
            unet_hb1 = 24 if self.ground_loss else 24
            self.unet = Unet5ContextualBneck(unet_c_in,
                                             2,
                                             sentence_embedding_size,
                                             hc1=unet_hc1,
                                             hb1=unet_hb1,
                                             hc2=128,
                                             split_embedding=splitemb)

        if attention:
            self.sentence_embedding = SentenceEmbeddingSelfAttention(
                word_embedding_size,
                lstm_size,
                sentence_embedding_layers,
                attention_heads=attention_heads)
        else:
            self.sentence_embedding = SentenceEmbeddingSimple(
                word_embedding_size, sentence_embedding_size,
                sentence_embedding_layers)

        self.gather2d = Gather2D()

        if self.class_loss:
            self.aux_class_linear = nn.Linear(32, 64)
            enable_weight_saving(self.aux_class_linear, "class_aux_linear")

        print("Sentence Embedding #Params: ",
              get_n_params(self.sentence_embedding))
        print("U-Net #Params: ", get_n_params(self.unet))
        print("Class auxiliary: ", self.class_loss)
        print("Ground auxiliary: ", self.ground_loss)

        # Enable saving of pre-trained weights
        enable_weight_saving(self.feature_net, "feature_resnet_light")
        enable_weight_saving(self.unet, "unet")
        enable_weight_saving(self.sentence_embedding, "sentence_embedding")

        if NLL:
            #self.mask_loss = nn.BCELoss()
            self.mask_loss = nn.NLLLoss2d()
        elif BCE:
            self.mask_loss = nn.BCEWithLogitsLoss()
        elif CE:
            self.spatialsoftmax = SpatialSoftmax2d()
            self.mask_loss = CrossEntropy2d()
        else:
            self.mask_loss = nn.MSELoss()

        self.aux_loss = nn.CrossEntropyLoss(reduce=True, size_average=True)
        self.epoch_numbers = {"train": 0, "eval": 0}
        self.iter = nn.Parameter(torch.zeros(1), requires_grad=False)

        self.dropout = nn.Dropout(0.5)
        self.dropout2d = nn.Dropout2d(0.5)
        self.dropout3d = nn.Dropout3d(0.5)

        self.viz_images = []
        self.instructions = []
    def __init__(self, run_name="", model_instance_name="only"):

        super(PVN_Stage2_ActorCritic, self).__init__()
        self.model_name = "pvn_stage2_ac"
        self.run_name = run_name
        self.instance_name = model_instance_name
        self.writer = LoggingSummaryWriter(log_dir=f"{get_logging_dir()}/runs/{run_name}/{self.instance_name}")

        self.params_s1 = get_current_parameters()["ModelPVN"]["Stage1"]
        self.params_s2 = get_current_parameters()["ModelPVN"]["Stage2"]
        self.params = get_current_parameters()["ModelPVN"]["ActorCritic"]

        self.oob = self.params_s1.get("clip_observability")
        self.ignore_struct = self.params.get("ignore_structured_input", False)

        self.prof = SimpleProfiler(torch_sync=PROFILE, print=PROFILE)
        self.iter = nn.Parameter(torch.zeros(1), requires_grad=False)

        self.tensor_store = KeyTensorStore()

        # Common
        # --------------------------------------------------------------------------------------------------------------
        # Wrap the Stage1 model so that it becomes invisible to PyTorch, get_parameters etc, and doesn't get optimized

        self.map_transform_w_to_r = MapTransformer(source_map_size=self.params_s1["global_map_size"],
                                                       dest_map_size=self.params_s1["local_map_size"],
                                                       world_size_px=self.params_s1["world_size_px"],
                                                       world_size_m=self.params_s1["world_size_m"])

        self.action_base = PVN_Stage2_RLBase(map_channels=self.params_s2["map_to_act_channels"],
                                             map_struct_channels=self.params_s2["map_structure_channels"],
                                             crop_size=self.params_s2["crop_size"],
                                             map_size=self.params_s1["local_map_size"],
                                             h1=self.params["h1"], h2=self.params["h2"],
                                             structure_h1=self.params["structure_h1"],
                                             obs_dim=self.params["obs_dim"],
                                             name="action")

        self.value_base = PVN_Stage2_RLBase(map_channels=self.params_s2["map_to_act_channels"],
                                            map_struct_channels=self.params_s2["map_structure_channels"],
                                            crop_size=self.params_s2["crop_size"],
                                            map_size=self.params_s1["local_map_size"],
                                            h1=self.params["h1"], h2=self.params["h2"],
                                            structure_h1=self.params["structure_h1"],
                                            obs_dim=self.params["obs_dim"],
                                            name="value")

        self.action_head = PVN_Stage2_ActionHead(h2=self.params["h2"])
        self.value_head = PVN_Stage2_ValueHead(h2=self.params["h2"])

        self.spatialsoftmax = SpatialSoftmax2d()
        self.batch_select = MapBatchSelect()
        self.gt_fill_missing = MapBatchFillMissing(
            self.params_s1["local_map_size"],
            self.params_s1["world_size_px"],
            self.params_s1["world_size_m"])

        self.env_id = None
        self.seg_idx = None
        self.prev_instruction = None
        self.seq_step = 0
        self.get_act_start_pose = None
        self.gt_labels = None

        # PPO interface:
        self.is_recurrent = False
Example #8
0
    def __init__(self, run_name="", domain="sim"):

        super(PVN_Stage1_Bidomain_Original, self).__init__()
        self.model_name = "pvn_stage1"
        self.run_name = run_name
        self.domain = domain
        self.writer = LoggingSummaryWriter(
            log_dir=f"{get_logging_dir()}/runs/{run_name}/{self.domain}")
        #self.writer = DummySummaryWriter()

        self.root_params = get_current_parameters()["ModelPVN"]
        self.params = self.root_params["Stage1"]
        self.use_aux = self.root_params["UseAux"]
        self.aux_weights = self.root_params["AuxWeights"]

        if self.params.get("weight_override"):
            aux_weights_override_name = "AuxWeightsRealOverride" if self.domain == "real" else "AuxWeightsSimOverride"
            aux_weights_override = self.root_params.get(
                aux_weights_override_name)
            if aux_weights_override:
                print(
                    f"Overriding auxiliary weights for domain: {self.domain}")
                self.aux_weights = dict_merge(self.aux_weights,
                                              aux_weights_override)

        self.prof = SimpleProfiler(torch_sync=PROFILE, print=PROFILE)
        self.iter = nn.Parameter(torch.zeros(1), requires_grad=False)

        self.tensor_store = KeyTensorStore()
        self.losses = AuxiliaryLosses()

        # Auxiliary Objectives
        self.do_perturb_maps = self.params["perturb_maps"]
        print("Perturbing maps: ", self.do_perturb_maps)

        # Path-pred FPV model definition
        # --------------------------------------------------------------------------------------------------------------

        self.num_feature_channels = self.params[
            "feature_channels"]  # + params["relevance_channels"]
        self.num_map_channels = self.params["pathpred_in_channels"]

        self.img_to_features_w = FPVToGlobalMap(
            source_map_size=self.params["global_map_size"],
            world_size_px=self.params["world_size_px"],
            world_size_m=self.params["world_size_m"],
            res_channels=self.params["resnet_channels"],
            map_channels=self.params["feature_channels"],
            img_w=self.params["img_w"],
            img_h=self.params["img_h"],
            cam_h_fov=self.params["cam_h_fov"],
            domain=domain,
            img_dbg=IMG_DBG)

        self.map_accumulator_w = LeakyIntegratorGlobalMap(
            source_map_size=self.params["global_map_size"],
            world_size_px=self.params["world_size_px"],
            world_size_m=self.params["world_size_m"])

        self.add_init_pos_to_coverage = AddDroneInitPosToCoverage(
            world_size_px=self.params["world_size_px"],
            world_size_m=self.params["world_size_m"],
            map_size_px=self.params["local_map_size"])

        # Pre-process the accumulated map to do language grounding if necessary - in the world reference frame
        self.map_processor_grounding = LangFilterMapProcessor(
            embed_size=self.params["emb_size"],
            in_channels=self.params["feature_channels"],
            out_channels=self.params["relevance_channels"],
            spatial=False,
            cat_out=False)

        ratio_prior_channels = self.params["feature_channels"]

        # Process the global accumulated map
        self.path_predictor_lingunet = RatioPathPredictor(
            self.params["lingunet"],
            prior_channels_in=self.params["feature_channels"],
            posterior_channels_in=self.params["pathpred_in_channels"],
            dual_head=self.params["predict_confidence"],
            compute_prior=self.params["compute_prior"],
            use_prior=self.params["use_prior_only"],
            oob=self.params["clip_observability"])

        print("UNet Channels: " + str(self.num_map_channels))
        print("Feature Channels: " + str(self.num_feature_channels))

        # TODO:O Verify that config has the same randomization parameters (yaw, pos, etc)
        self.second_transform = self.do_perturb_maps or self.params[
            "predict_in_start_frame"]

        # Sentence Embedding
        self.sentence_embedding = SentenceEmbeddingSimple(
            self.params["word_emb_size"], self.params["emb_size"],
            self.params["emb_layers"], self.params["emb_dropout"])

        self.map_transform_local_to_local = MapTransformer(
            source_map_size=self.params["local_map_size"],
            dest_map_size=self.params["local_map_size"],
            world_size_px=self.params["world_size_px"],
            world_size_m=self.params["world_size_m"])

        self.map_transform_global_to_local = MapTransformer(
            source_map_size=self.params["global_map_size"],
            dest_map_size=self.params["local_map_size"],
            world_size_px=self.params["world_size_px"],
            world_size_m=self.params["world_size_m"])

        self.map_transform_local_to_global = MapTransformer(
            source_map_size=self.params["local_map_size"],
            dest_map_size=self.params["global_map_size"],
            world_size_px=self.params["world_size_px"],
            world_size_m=self.params["world_size_m"])

        self.map_transform_s_to_p = self.map_transform_local_to_local
        self.map_transform_w_to_s = self.map_transform_global_to_local
        self.map_transform_w_to_r = self.map_transform_global_to_local
        self.map_transform_r_to_s = self.map_transform_local_to_local
        self.map_transform_r_to_w = self.map_transform_local_to_global
        self.map_transform_p_to_w = self.map_transform_local_to_global
        self.map_transform_p_to_r = self.map_transform_local_to_local

        # Batch select is used to drop and forget semantic maps at those timestaps that we're not planning in
        self.batch_select = MapBatchSelect()
        # Since we only have path predictions for some timesteps (the ones not dropped above), we use this to fill
        # in the missing pieces by reorienting the past trajectory prediction into the frame of the current timestep
        self.map_batch_fill_missing = MapBatchFillMissing(
            self.params["local_map_size"], self.params["world_size_px"],
            self.params["world_size_m"])

        self.spatialsoftmax = SpatialSoftmax2d()
        self.visitation_softmax = VisitationSoftmax()

        #TODO:O Use CroppedMapToActionTriplet in Wrapper as Stage2
        # Auxiliary Objectives
        # --------------------------------------------------------------------------------------------------------------

        # We add all auxiliaries that are necessary. The first argument is the auxiliary name, followed by parameters,
        # followed by variable number of names of inputs. ModuleWithAuxiliaries will automatically collect these inputs
        # that have been saved with keep_auxiliary_input() during execution
        if self.use_aux["class_features"]:
            self.losses.add_auxiliary(
                ClassAuxiliary2D("class_features",
                                 self.params["feature_channels"],
                                 self.params["num_landmarks"], 0,
                                 "fpv_features", "lm_pos_fpv", "lm_indices"))
        if self.use_aux["grounding_features"]:
            self.losses.add_auxiliary(
                ClassAuxiliary2D("grounding_features",
                                 self.params["relevance_channels"], 2, 0,
                                 "fpv_features_g", "lm_pos_fpv",
                                 "lm_mentioned"))
        if self.use_aux["class_map"]:
            self.losses.add_auxiliary(
                ClassAuxiliary2D("class_map", self.params["feature_channels"],
                                 self.params["num_landmarks"], 0, "S_W_select",
                                 "lm_pos_map_select", "lm_indices_select"))
        if self.use_aux["grounding_map"]:
            self.losses.add_auxiliary(
                ClassAuxiliary2D("grounding_map",
                                 self.params["relevance_channels"], 2, 0,
                                 "R_W_select", "lm_pos_map_select",
                                 "lm_mentioned_select"))
        # CoRL model uses alignment-model groundings
        if self.use_aux["lang"]:
            # one output for each landmark, 2 classes per output. This is for finetuning, so use the embedding that's gonna be fine tuned
            self.losses.add_auxiliary(
                ClassAuxiliary("lang", self.params["emb_size"], 2,
                               self.params["num_landmarks"], "sentence_embed",
                               "lang_lm_mentioned"))

        if self.use_aux["regularize_map"]:
            self.losses.add_auxiliary(
                FeatureRegularizationAuxiliary2D("regularize_map", "l1",
                                                 "S_W_select"))

        lossfunc = self.params["path_loss_function"]
        if self.params["clip_observability"]:
            self.losses.add_auxiliary(
                PathAuxiliary2D("visitation_dist", lossfunc,
                                self.params["clip_observability"],
                                "log_v_dist_s_select",
                                "v_dist_s_ground_truth_select", "SM_S_select"))
        else:
            self.losses.add_auxiliary(
                PathAuxiliary2D("visitation_dist", lossfunc,
                                self.params["clip_observability"],
                                "log_v_dist_s_select",
                                "v_dist_s_ground_truth_select", "SM_S_select"))

        self.goal_good_criterion = GoalPredictionGoodCriterion(
            ok_distance=self.params["world_size_px"] * 0.1)
        self.goal_acc_meter = MovingAverageMeter(10)
        self.visible_goal_acc_meter = MovingAverageMeter(10)
        self.invisible_goal_acc_meter = MovingAverageMeter(10)
        self.visible_goal_frac_meter = MovingAverageMeter(10)

        self.losses.print_auxiliary_info()

        self.total_goals = 0
        self.correct_goals = 0

        self.env_id = None
        self.env_img = None
        self.seg_idx = None
        self.prev_instruction = None
        self.seq_step = 0

        self.should_save_path_overlays = False
Example #9
0
    def forward(self,
                images,
                states,
                instructions,
                instr_lengths,
                plan=None,
                noisy_start_poses=None,
                start_poses=None,
                firstseg=None,
                select_only=True,
                halfway=False,
                grad_noise=False,
                rl=False):
        """
        :param images: BxCxHxW batch of images (observations)
        :param states: BxK batch of drone states
        :param instructions: BxM LongTensor where M is the maximum length of any instruction
        :param instr_lengths: list of len B of integers, indicating length of each instruction
        :param plan: list of B booleans indicating True for timesteps where we do planning and False otherwise
        :param noisy_start_poses: list of noisy start poses (for data-augmentation). These define the path-prediction frame at training time
        :param start_poses: list of drone start poses (these should be equal in practice)
        :param firstseg: list of booleans indicating True if a new segment starts at that timestep
        :param select_only: boolean indicating whether to only compute visitation distributions for planning timesteps (default True)
        :param rl: boolean indicating if we're doing reinforcement learning. If yes, output more than the visitation distribution
        :return:
        """
        cam_poses = self.cam_poses_from_states(states)
        g_poses = None  # None pose is a placeholder for the canonical global pose.
        self.prof.tick("out")

        self.tensor_store.keep_inputs("fpv", images)

        # Calculate the instruction embedding
        if instructions is not None:
            # TODO: Take batch of instructions and their lengths, return batch of embeddings. Store the last one as internal state
            # TODO: There's an assumption here that there's only a single instruction in the batch and it doesn't change
            # UNCOMMENT THE BELOW LINE TO REVERT BACK TO GENERAL CASE OF SEPARATE INSTRUCTION PER STEP
            if self.params["ignore_instruction"]:
                # If we're ignoring instructions, just feed in an instruction that consists of a single zero-token
                sent_embeddings = self.sentence_embedding(
                    torch.zeros_like(instructions[0:1, 0:1]),
                    torch.ones_like(instr_lengths[0:1]))
            else:
                sent_embeddings = self.sentence_embedding(
                    instructions[0:1], instr_lengths[0:1])
            self.tensor_store.keep_inputs("sentence_embed", sent_embeddings)
        else:
            sent_embeddings = self.sentence_embedding.get()

        self.prof.tick("embed")

        # Extract and project features onto the egocentric frame for each image
        F_W, M_W = self.img_to_features_w(images,
                                          cam_poses,
                                          sent_embeddings,
                                          self.tensor_store,
                                          show="",
                                          halfway=halfway)

        # For training the critic, this is as far as we need to poceed with the computation.
        # self.img_to_features_w has stored computed feature maps inside the tensor store, which will then be retrieved by the critic
        if halfway == True:  # Warning: halfway must be True not truthy
            return None, None

        self.tensor_store.keep_inputs("F_w", F_W)
        self.tensor_store.keep_inputs("M_w", M_W)
        self.prof.tick("img_to_map_frame")

        # Accumulate the egocentric features in a global map
        reset_mask = firstseg if self.params["clear_history"] else None

        # Consider the space very near the drone and right under it as observed - draw ones on the observability mask
        # If we treat that space as unobserved, then there's going to be a gap in the visitation distribution, which
        # makes training with RL more difficult, as there is no reward feedback if the drone doesn't cross that gap.
        if self.params.get("cover_init_pos", False):
            StartMasks_R = self.add_init_pos_to_coverage.get_init_pos_masks(
                M_W.shape[0], M_W.device)
            StartMasks_W, _ = self.map_transform_r_to_w(
                StartMasks_R, cam_poses, None)
            M_W = self.add_init_pos_to_coverage(M_W, StartMasks_W)

        S_W, SM_W = self.map_accumulator_w(F_W,
                                           M_W,
                                           reset_mask=reset_mask,
                                           show="acc" if IMG_DBG else "")
        S_W_poses = g_poses
        self.prof.tick("map_accumulate")

        # If we're training Stage 2 with imitation learning from ground truth visitation distributions, we want to
        # compute observability masks with the same code that's used in Stage 1 to avoid mistakes.
        if halfway == "observability":
            map_uncoverage_w = 1 - SM_W
            return map_uncoverage_w

        # Throw away those timesteps that don't correspond to planning timesteps
        S_W_select, SM_W_select, S_W_poses_select, cam_poses_select, noisy_start_poses_select, start_poses_select, sent_embeddings_select = \
            self.batch_select(S_W, SM_W, S_W_poses, cam_poses, noisy_start_poses, start_poses, sent_embeddings, plan)

        #maps_m_prior_select, maps_m_posterior_select = None, None

        # Only process the maps on plannieng timesteps
        if len(S_W_select) == 0:
            return None

        self.tensor_store.keep_inputs("S_W_select", S_W_select)
        self.prof.tick("batch_select")

        # Process the map via the two map_procesors
        # Do grounding of objects in the map chosen to do so
        if self.use_aux["grounding_map"]:
            R_W_select, RS_W_poses_select = self.map_processor_grounding(
                S_W_select, sent_embeddings_select, S_W_poses_select, show="")
            self.tensor_store.keep_inputs("R_W_select", R_W_select)
            self.prof.tick("map_proc_gnd")
            # Concatenate grounding map and semantic map along channel dimension
            RS_W_select = torch.cat([S_W_select, R_W_select], 1)

        else:
            RS_W_select = S_W_select
            RS_W_poses_select = S_W_poses_select

        s_poses_select = start_poses_select if self.params[
            "predict_in_start_frame"] else cam_poses_select
        RS_S_select, RS_S_poses_select = self.map_transform_w_to_s(
            RS_W_select, RS_W_poses_select, s_poses_select)
        SM_S_select, SM_S_poses_select = self.map_transform_w_to_s(
            SM_W_select, S_W_poses_select, s_poses_select)

        assert SM_S_poses_select == RS_S_poses_select, "Masks and maps should have the same pose in start frame"

        self.tensor_store.keep_inputs("RS_S_select", RS_S_select)
        self.tensor_store.keep_inputs("SM_S_select", SM_S_select)
        self.prof.tick("transform_w_to_s")

        # Data augmentation for trajectory prediction
        map_poses_clean_select = None
        # TODO: Figure out if we can just swap out start poses for noisy poses and get rid of separate noisy poses
        if self.do_perturb_maps:
            assert noisy_start_poses_select is not None, "Noisy poses must be provided if we're perturbing maps"
            RS_P_select, RS_P_poses_select = self.map_transform_s_to_p(
                RS_S_select, RS_S_poses_select, noisy_start_poses_select)
        else:
            RS_P_select, RS_P_poses_select = RS_S_select, RS_S_poses_select

        self.tensor_store.keep_inputs("RS_perturbed_select", RS_P_select)
        self.prof.tick("map_perturb")

        sent_embeddings_pp = sent_embeddings_select

        # Run lingunet on the map to predict visitation distribution scores (pre-softmax)
        # ---------
        log_v_dist_p_select, v_dist_p_poses_select = self.path_predictor_lingunet(
            RS_P_select,
            sent_embeddings_pp,
            RS_P_poses_select,
            tensor_store=self.tensor_store)
        # ---------

        self.prof.tick("pathpred")

        # TODO: Shouldn't we be transforming probability distributions instead of scores? Otherwise OOB space will have weird values
        # Transform distributions back to world reference frame and keep them (these are the model outputs)
        both_inner_w, v_dist_w_poses_select = self.map_transform_p_to_w(
            log_v_dist_p_select.inner_distribution, v_dist_p_poses_select,
            None)
        log_v_dist_w_select = Partial2DDistribution(
            both_inner_w, log_v_dist_p_select.outer_prob_mass)
        self.tensor_store.keep_inputs("log_v_dist_w_select",
                                      log_v_dist_w_select)

        # Transform distributions back to start reference frame and keep them (for auxiliary objective)
        both_inner_s, v_dist_s_poses_select = self.map_transform_p_to_r(
            log_v_dist_p_select.inner_distribution, v_dist_p_poses_select,
            start_poses_select)
        log_v_dist_s_select = Partial2DDistribution(
            both_inner_s, log_v_dist_p_select.outer_prob_mass)
        self.tensor_store.keep_inputs("log_v_dist_s_select",
                                      log_v_dist_s_select)

        # prime number will mean that it will alternate between sim and real
        if self.get_iter() % 23 == 0:
            lsfm = SpatialSoftmax2d()
            for i in range(S_W_select.shape[0]):
                Presenter().show_image(S_W_select.detach().cpu()[i, 0:3],
                                       f"{self.domain}_s_w_select",
                                       scale=4,
                                       waitkey=1)
                Presenter().show_image(lsfm(
                    log_v_dist_s_select.inner_distribution).detach().cpu()[i],
                                       f"{self.domain}_v_dist_s_select",
                                       scale=4,
                                       waitkey=1)
                Presenter().show_image(lsfm(
                    log_v_dist_p_select.inner_distribution).detach().cpu()[i],
                                       f"{self.domain}_v_dist_p_select",
                                       scale=4,
                                       waitkey=1)
                Presenter().show_image(RS_P_select.detach().cpu()[i, 0:3],
                                       f"{self.domain}_rs_p_select",
                                       scale=4,
                                       waitkey=1)
                break

        self.prof.tick("transform_back")

        # If we're predicting the trajectory only on some timesteps, then for each timestep k, use the map from
        # timestep k if predicting on timestep k. otherwise use the map from timestep j - the last timestep
        # that had a trajectory prediction, rotated in the frame of timestep k.
        if select_only:
            # If we're just pre-training the trajectory prediction, don't waste time on generating the missing maps
            log_v_dist_w = log_v_dist_w_select
            v_dist_w_poses = v_dist_w_poses_select
        else:
            raise NotImplementedError("select_only must be True")

        return_list = [log_v_dist_w, v_dist_w_poses]
        if rl:
            internals_for_rl = {
                "map_coverage_w": SM_W,
                "map_uncoverage_w": 1 - SM_W
            }
            return_list.append(internals_for_rl)

        return tuple(return_list)
Example #10
0
    def forward(self,
                images,
                states,
                instructions,
                instr_lengths,
                plan=None,
                noisy_start_poses=None,
                start_poses=None,
                firstseg=None,
                select_only=True,
                halfway=False,
                grad_noise=False,
                rl=False,
                noshow=False):
        """
        :param images: BxCxHxW batch of images (observations)
        :param states: BxK batch of drone states
        :param instructions: BxM LongTensor where M is the maximum length of any instruction
        :param instr_lengths: list of len B of integers, indicating length of each instruction
        :param plan: list of B booleans indicating True for timesteps where we do planning and False otherwise
        :param noisy_start_poses: list of noisy start poses (for data-augmentation). These define the path-prediction frame at training time
        :param start_poses: list of drone start poses (these should be equal in practice)
        :param firstseg: list of booleans indicating True if a new segment starts at that timestep
        :param select_only: boolean indicating whether to only compute visitation distributions for planning timesteps (default True)
        :param rl: boolean indicating if we're doing reinforcement learning. If yes, output more than the visitation distribution
        :return:
        """
        cam_poses = self.cam_poses_from_states(states)
        g_poses = None  # None pose is a placeholder for the canonical global pose.
        self.prof.tick("out")

        # If we're running the blind baseline, simply replace the images with zeros. Keep all other computation.
        if self.params.get("blind"):
            images = torch.zeros_like(images)

        self.tensor_store.keep_inputs("fpv", images)

        # Calculate the instruction embedding
        if instructions is not None:
            # TODO: Take batch of instructions and their lengths, return batch of embeddings. Store the last one as internal state
            # TODO: handle this
            # ASSUMING IT'S THE SAME INSTRUCTION SEGMENT (PREDICT ONE SEGMENT AT A TIME).
            # UNCOMMENT THE BELOW LINE TO REVERT BACK TO GENERAL CASE OF SEPARATE INSTRUCTION PER STEP
            if self.params["ignore_instruction"]:
                # If we're ignoring instructions, just feed in an instruction that consists of a single zero-token
                sent_embeddings = self.sentence_embedding(
                    torch.zeros_like(instructions[0:1, 0:1]),
                    torch.ones_like(torch.tensor(instr_lengths[0:1])))
            else:
                sent_embeddings = self.sentence_embedding(
                    instructions[0:1], instr_lengths[0:1])
            #sent_embeddings = self.sentence_embedding(instructions, instr_lengths)
            self.tensor_store.keep_inputs("sentence_embed", sent_embeddings)
        else:
            sent_embeddings = self.sentence_embedding.get()

        self.prof.tick("embed")

        # Extract and project features onto the egocentric frame for each image
        F_W, M_W = self.img_to_features_w(images,
                                          cam_poses,
                                          sent_embeddings,
                                          self.tensor_store,
                                          show="",
                                          halfway=halfway)

        if halfway == True and not halfway == "v2":
            return None, None

        self.tensor_store.keep_inputs("F_w", F_W)
        self.tensor_store.keep_inputs("M_w", M_W)
        self.prof.tick("img_to_map_frame")

        # Accumulate the egocentric features in a global map
        reset_mask = firstseg if self.params["clear_history"] else None

        # Consider the space very near the drone as observable - it is too hard to explore to consider it unobservable.
        if self.params.get("cover_init_pos", False):
            StartMasks_R = self.add_init_pos_to_coverage.get_init_pos_masks(
                M_W.shape[0], M_W.device)
            StartMasks_W, _ = self.map_transform_r_to_w(
                StartMasks_R, cam_poses, None)
            M_W = self.add_init_pos_to_coverage(M_W, StartMasks_W)

        S_W, SM_W = self.map_accumulator_w(F_W,
                                           M_W,
                                           reset_mask=reset_mask,
                                           show="acc" if IMG_DBG else "")
        S_W_poses = g_poses
        self.tensor_store.keep_inputs("SM_w", SM_W)
        self.prof.tick("map_accumulate")

        if halfway == "observability":
            map_uncoverage_w = 1 - SM_W
            return map_uncoverage_w

        # Throw away those timesteps that don't correspond to planning timesteps
        S_W_select, SM_W_select, S_W_poses_select, cam_poses_select, noisy_start_poses_select, start_poses_select, sent_embeddings_select = \
            self.batch_select(S_W, SM_W, S_W_poses, cam_poses, noisy_start_poses, start_poses, sent_embeddings, plan)

        #maps_m_prior_select, maps_m_posterior_select = None, None

        # Only process the maps on plannieng timesteps
        if len(S_W_select) == 0:
            return None

        self.tensor_store.keep_inputs("S_W_select", S_W_select)
        self.prof.tick("batch_select")

        # Create a figure where the drone is drawn on the map
        if self.params["write_figures"]:
            self.tensor_store.keep_inputs(
                "drone_poses", Variable(draw_drone_poses(cam_poses_select)))

        # Process the map via the two map_procesors
        # Do grounding of objects in the map chosen to do so
        if self.use_aux["grounding_map"]:
            R_W_select, RS_W_poses_select = self.map_processor_grounding(
                S_W_select, sent_embeddings_select, S_W_poses_select, show="")
            self.tensor_store.keep_inputs("R_W_select", R_W_select)
            self.prof.tick("map_proc_gnd")
            # Concatenate grounding map and semantic map along channel dimension
            RS_W_select = torch.cat([S_W_select, R_W_select], 1)

        else:
            RS_W_select = S_W_select
            RS_W_poses_select = S_W_poses_select

        s_poses_select = start_poses_select if self.params[
            "predict_in_start_frame"] else cam_poses_select
        RS_S_select, RS_S_poses_select = self.map_transform_w_to_s(
            RS_W_select, RS_W_poses_select, s_poses_select)
        SM_S_select, SM_S_poses_select = self.map_transform_w_to_s(
            SM_W_select, S_W_poses_select, s_poses_select)

        assert SM_S_poses_select == RS_S_poses_select, "Masks and maps should have the same pose in start frame"

        self.tensor_store.keep_inputs("RS_S_select", RS_S_select)
        self.tensor_store.keep_inputs("SM_S_select", SM_S_select)
        self.prof.tick("transform_w_to_s")

        # Data augmentation for trajectory prediction
        map_poses_clean_select = None
        # TODO: Figure out if we can just swap out start poses for noisy poses and get rid of separate noisy poses
        if self.do_perturb_maps:
            assert noisy_start_poses_select is not None, "Noisy poses must be provided if we're perturbing maps"
            RS_P_select, RS_P_poses_select = self.map_transform_s_to_p(
                RS_S_select, RS_S_poses_select, noisy_start_poses_select)
        else:
            RS_P_select, RS_P_poses_select = RS_S_select, RS_S_poses_select

        # TODO: Figure out why this is unused!
        self.tensor_store.keep_inputs("RS_perturbed_select", RS_P_select)
        self.prof.tick("map_perturb")

        sent_embeddings_pp = sent_embeddings_select

        # Process the map via the two map_procesors (e.g. predict the trajectory that we'll be taking)
        # ---------
        log_v_dist_p_select, v_dist_p_poses_select = self.path_predictor_lingunet(
            RS_P_select,
            sent_embeddings_pp,
            RS_P_poses_select,
            tensor_store=self.tensor_store)
        # ---------

        self.prof.tick("pathpred")

        # Transform distributions back to world reference frame and keep them (these are the model outputs)
        both_inner_w, v_dist_w_poses_select = self.map_transform_p_to_w(
            log_v_dist_p_select.inner_distribution, v_dist_p_poses_select,
            None)
        log_v_dist_w_select = Partial2DDistribution(
            both_inner_w, log_v_dist_p_select.outer_prob_mass)
        self.tensor_store.keep_inputs("log_v_dist_w_select",
                                      log_v_dist_w_select)

        # Transform distributions back to start reference frame and keep them (for auxiliary objective)
        both_inner_s, v_dist_s_poses_select = self.map_transform_p_to_r(
            log_v_dist_p_select.inner_distribution, v_dist_p_poses_select,
            start_poses_select)
        log_v_dist_s_select = Partial2DDistribution(
            both_inner_s, log_v_dist_p_select.outer_prob_mass)
        self.tensor_store.keep_inputs("log_v_dist_s_select",
                                      log_v_dist_s_select)

        lsfm = SpatialSoftmax2d()
        # prime number will mean that it will alternate between sim and real
        if self.get_iter() % 23 == 0 and not noshow:
            for i in range(S_W_select.shape[0]):
                Presenter().show_image(S_W_select.detach().cpu()[i, 0:3],
                                       f"{self.domain}_s_w_select",
                                       scale=4,
                                       waitkey=1)
                Presenter().show_image(lsfm(
                    log_v_dist_s_select.inner_distribution).detach().cpu()[i],
                                       f"{self.domain}_v_dist_s_select",
                                       scale=4,
                                       waitkey=1)
                Presenter().show_image(lsfm(
                    log_v_dist_p_select.inner_distribution).detach().cpu()[i],
                                       f"{self.domain}_v_dist_p_select",
                                       scale=4,
                                       waitkey=1)
                Presenter().show_image(RS_P_select.detach().cpu()[i, 0:3],
                                       f"{self.domain}_rs_p_select",
                                       scale=4,
                                       waitkey=1)
                break

        self.prof.tick("transform_back")

        # If we're predicting the trajectory only on some timesteps, then for each timestep k, use the map from
        # timestep k if predicting on timestep k. otherwise use the map from timestep j - the last timestep
        # that had a trajectory prediction, rotated in the frame of timestep k.
        if select_only:
            # If we're just pre-training the trajectory prediction, don't waste time on generating the missing maps
            log_v_dist_w = log_v_dist_w_select
            v_dist_w_poses = v_dist_w_poses_select
            cam_poses = cam_poses_select
            sent_embeddings = sent_embeddings_select
        else:
            log_v_dist_w, v_dist_w_poses, log_goal_oob_score = self.map_batch_fill_missing(
                log_v_dist_w_select, v_dist_w_poses_select, plan, show="")
            # TODO: Fix this
            self.tensor_store.keep_inputs("log_both_dists_w", log_both_dists_w)
            self.prof.tick("map_fill_missing")

        return_list = [log_v_dist_w, v_dist_w_poses]
        if rl:
            internals_for_rl = {
                "map_coverage_w": SM_W,
                "map_uncoverage_w": 1 - SM_W
            }
            return_list.append(internals_for_rl)

        return tuple(return_list)