Example #1
    def __init__(self,
                 img_in_size=256,
                 world_size_in_img=256,
                 feature_channels=32,
                 ground_channels=3,
                 embed_size=40,
                 aux_ground=False,
                 freeze=False):
        super(TopDownToEgoMap, self).__init__(img_in_size, world_size_in_img)

        # Process images using a resnet to get a feature map
        self.feature_net = ResNet13Light(feature_channels, down_pad=True)

        self.aux_ground = aux_ground
        if aux_ground:
            self.lang_filter = MapLangSemanticFilter(embed_size,
                                                     feature_channels,
                                                     ground_channels)
            enable_weight_saving(self.lang_filter,
                                 "ground_filter",
                                 alwaysfreeze=freeze)

        enable_weight_saving(self.feature_net,
                             "feature_resnet_light",
                             alwaysfreeze=freeze)
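Every example on this page registers sub-modules with enable_weight_saving, passing a module, a string key, and optional alwaysfreeze / neverfreeze flags. The call sites only reveal the signature; below is a minimal sketch of what such a helper could do, assuming it merely records the module for later checkpointing and optionally freezes its parameters (the real implementation in the source project may differ):

import torch.nn as nn

# Hypothetical registry; the actual project keeps its own bookkeeping.
_SAVED_MODULES = {}

def enable_weight_saving(module: nn.Module, name: str,
                         alwaysfreeze=False, neverfreeze=False):
    # Remember the module under a stable key so its weights can be
    # saved and reloaded independently of the full model.
    _SAVED_MODULES[name] = module
    # Optionally stop gradient updates for pre-trained components.
    if alwaysfreeze and not neverfreeze:
        for p in module.parameters():
            p.requires_grad = False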
Example #2
    def __init__(self, run_name=""):

        super(ModelTrajectoryToAction, self).__init__()
        self.model_name = "lsvd_action"
        self.run_name = run_name
        self.writer = LoggingSummaryWriter(log_dir="runs/" + run_name)

        self.params = get_current_parameters()["ModelPVN"]
        self.aux_weights = get_current_parameters()["AuxWeights"]

        self.prof = SimpleProfiler(torch_sync=PROFILE, print=PROFILE)
        self.iter = nn.Parameter(torch.zeros(1), requires_grad=False)

        # Common
        # --------------------------------------------------------------------------------------------------------------
        self.map_transform_w_to_s = MapTransformerBase(
            source_map_size=self.params["global_map_size"],
            dest_map_size=self.params["local_map_size"],
            world_size=self.params["world_size_px"])

        self.map_transform_r_to_w = MapTransformerBase(
            source_map_size=self.params["local_map_size"],
            dest_map_size=self.params["global_map_size"],
            world_size=self.params["world_size_px"])

        # Output an action given the global semantic map
        if self.params["map_to_action"] == "downsample2":
            self.map_to_action = EgoMapToActionTriplet(
                map_channels=self.params["map_to_act_channels"],
                map_size=self.params["local_map_size"],
                other_features_size=self.params["emb_size"])

        elif self.params["map_to_action"] == "cropped":
            self.map_to_action = CroppedMapToActionTriplet(
                map_channels=self.params["map_to_act_channels"],
                map_size=self.params["local_map_size"],
                manual=self.params["manual_rule"],
                path_only=self.params["action_in_path_only"],
                recurrence=self.params["action_recurrence"])

        self.spatialsoftmax = SpatialSoftmax2d()
        self.gt_fill_missing = MapBatchFillMissing(
            self.params["local_map_size"], self.params["world_size_px"])

        # Don't freeze the trajectory-to-action weights: they are pre-trained during path-prediction training
        # and fine-tuned end-to-end on all timesteps
        enable_weight_saving(self.map_to_action,
                             "map_to_action",
                             alwaysfreeze=False,
                             neverfreeze=True)

        self.action_loss = ActionLoss()

        self.env_id = None
        self.seg_idx = None
        self.prev_instruction = None
        self.seq_step = 0
        self.get_act_start_pose = None
        self.gt_labels = None
Example #3
    def __init__(self, name, channels_in=32, map_world_size=32, *inputs):
        super(GoalAuxiliary2D, self).__init__(name, *inputs)
        self.gather_2d = Gather2D()

        self.channels_in = channels_in
        self.map_world_size = map_world_size
        self.goal_linear = nn.Linear(channels_in, 2)
        enable_weight_saving(self.goal_linear, "aux_goal_linear_" + name)
        self.loss = nn.CrossEntropyLoss()
        self.accuracy_meter = MovingAverageMeter(10)
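GoalAuxiliary2D reduces to a 2-way linear classifier over gathered map cells scored with cross-entropy. A self-contained sketch of that pattern in plain PyTorch (Gather2D and the surrounding framework are assumed, so random stand-in features replace the gathered cells):

import torch
import torch.nn as nn

channels_in = 32
goal_linear = nn.Linear(channels_in, 2)   # 2 classes: goal vs. not-goal
loss_fn = nn.CrossEntropyLoss()

cells = torch.randn(16, channels_in)      # 16 gathered map-cell feature vectors
labels = torch.randint(0, 2, (16,))       # ground-truth goal label per cell
loss = loss_fn(goal_linear(cells), labels)
loss.backward()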
Example #4
    def __init__(self, name, feature_vec_len=32, num_classes=2, num_outputs=1, *inputs):
        super(ClassAuxiliary, self).__init__(name, *inputs)
        self.channels_in = feature_vec_len
        self.num_classes = num_classes
        self.num_outputs = num_outputs

        self.cls_linear = nn.Linear(feature_vec_len, num_classes * num_outputs)
        enable_weight_saving(self.cls_linear, "aux_class_linear_" + name)

        self.loss = nn.CrossEntropyLoss()
        self.meter_accuracy = MovingAverageMeter(10)
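ClassAuxiliary predicts num_outputs independent classifications from a single feature vector by emitting num_classes * num_outputs logits from one linear layer. A short sketch of how those logits can be reshaped for nn.CrossEntropyLoss (stand-in shapes chosen purely for illustration):

import torch
import torch.nn as nn

feature_vec_len, num_classes, num_outputs = 32, 2, 6
cls_linear = nn.Linear(feature_vec_len, num_classes * num_outputs)

x = torch.randn(4, feature_vec_len)                        # batch of embeddings
logits = cls_linear(x).view(4, num_outputs, num_classes)   # one prediction per output
labels = torch.randint(0, num_classes, (4, num_outputs))
# CrossEntropyLoss expects the class dimension second: (N, C, num_outputs)
loss = nn.CrossEntropyLoss()(logits.permute(0, 2, 1), labels)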
Example #5
    def __init__(self, name, feature_vec_len=32, num_classes=64, dropout=0, *inputs):
        super(ClassAuxiliary2D, self).__init__(name, *inputs)
        self.gather_2d = Gather2D()
        self.channels_in = feature_vec_len
        self.dropout = nn.Dropout(dropout)
        self.num_classes = num_classes

        self.cls_linear = nn.Linear(feature_vec_len, num_classes)
        enable_weight_saving(self.cls_linear, "aux_class_linear_2d_" + name)
        self.loss = nn.CrossEntropyLoss()
        if self.name == "aux_grounding_map":
            self.loss.weight = torch.tensor([0.5, 0.5])
        self.meter_accuracy = MovingAverageMeter(10)
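Example #5 attaches per-class weights to the loss after constructing it; assigning to .weight works because CrossEntropyLoss registers it as a buffer. The same effect can be had more directly at construction time:

import torch
import torch.nn as nn

# Equivalent to building CrossEntropyLoss() and then setting .weight afterwards.
loss = nn.CrossEntropyLoss(weight=torch.tensor([0.5, 0.5]))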
Example #6
    def __init__(self,
                 name,
                 world_size_px=32,
                 feature_vec_len=32,
                 num_classes=64,
                 dropout=0,
                 *inputs):
        super(ClassAuxiliary2D, self).__init__(name, *inputs)
        self.gather_2d = Gather2D()
        self.channels_in = feature_vec_len
        self.dropout = nn.Dropout(dropout)
        self.num_classes = num_classes

        self.world_size_px = world_size_px
        self.cls_linear = nn.Linear(feature_vec_len, num_classes)
        enable_weight_saving(self.cls_linear, "aux_class_linear_2d_" + name)
        self.loss = nn.CrossEntropyLoss()
        self.meter_accuracy = MovingAverageMeter(10)
Example #7
    def __init__(self, run_name="", model_class=MODEL_RSS,
                 aux_class_features=False, aux_grounding_features=False,
                 aux_class_map=False, aux_grounding_map=False, aux_goal_map=False,
                 aux_lang=False, aux_traj=False, rot_noise=False, pos_noise=False):

        super(ModelTrajectoryTopDown, self).__init__()
        self.model_name = "sm_trajectory" + str(model_class)
        self.model_class = model_class
        print("Init model of type: ", str(model_class))
        self.run_name = run_name
        self.writer = LoggingSummaryWriter(log_dir="runs/" + run_name)

        self.params = get_current_parameters()["Model"]
        self.aux_weights = get_current_parameters()["AuxWeights"]

        self.prof = SimpleProfiler(torch_sync=PROFILE, print=PROFILE)
        self.iter = nn.Parameter(torch.zeros(1), requires_grad=False)

        # Auxiliary Objectives
        self.use_aux_class_features = aux_class_features
        self.use_aux_grounding_features = aux_grounding_features
        self.use_aux_class_on_map = aux_class_map
        self.use_aux_grounding_on_map = aux_grounding_map
        self.use_aux_goal_on_map = aux_goal_map
        self.use_aux_lang = aux_lang
        self.use_aux_traj_on_map = aux_traj
        self.use_aux_reg_map = self.aux_weights["regularize_map"]

        self.use_rot_noise = rot_noise
        self.use_pos_noise = pos_noise


        # Path-pred FPV model definition
        # --------------------------------------------------------------------------------------------------------------

        self.img_to_features_w = FPVToGlobalMap(
            source_map_size=self.params["global_map_size"], world_size_px=self.params["world_size_px"], world_size=self.params["world_size_m"],
            res_channels=self.params["resnet_channels"], map_channels=self.params["feature_channels"],
            img_w=self.params["img_w"], img_h=self.params["img_h"], img_dbg=IMG_DBG)

        self.map_accumulator_w = LeakyIntegratorGlobalMap(source_map_size=self.params["global_map_size"], world_in_map_size=self.params["world_size_px"])

        # Pre-process the accumulated map to do language grounding if necessary - in the world reference frame
        if self.use_aux_grounding_on_map and not self.use_aux_grounding_features:
            self.map_processor_a_w = LangFilterMapProcessor(
                source_map_size=self.params["global_map_size"],
                world_size=self.params["world_size_px"],
                embed_size=self.params["emb_size"],
                in_channels=self.params["feature_channels"],
                out_channels=self.params["relevance_channels"],
                spatial=False, cat_out=True)
        else:
            self.map_processor_a_w = IdentityMapProcessor(source_map_size=self.params["global_map_size"], world_size=self.params["world_size_px"])

        if self.use_aux_goal_on_map:
            self.map_processor_b_r = LangFilterMapProcessor(source_map_size=self.params["local_map_size"],
                                                            world_size=self.params["world_size_px"],
                                                            embed_size=self.params["emb_size"],
                                                            in_channels=self.params["relevance_channels"],
                                                            out_channels=self.params["goal_channels"],
                                                            spatial=True, cat_out=True)
        else:
            self.map_processor_b_r = IdentityMapProcessor(source_map_size=self.params["local_map_size"],
                                                          world_size=self.params["world_size_px"])

        pred_channels = self.params["goal_channels"] + self.params["relevance_channels"]

        # Common
        # --------------------------------------------------------------------------------------------------------------

        # Sentence Embedding
        self.sentence_embedding = SentenceEmbeddingSimple(
            self.params["word_emb_size"], self.params["emb_size"], self.params["emb_layers"])

        self.map_transform_w_to_r = MapTransformerBase(source_map_size=self.params["global_map_size"],
                                                       dest_map_size=self.params["local_map_size"],
                                                       world_size=self.params["world_size_px"])
        self.map_transform_r_to_w = MapTransformerBase(source_map_size=self.params["local_map_size"],
                                                       dest_map_size=self.params["global_map_size"],
                                                       world_size=self.params["world_size_px"])

        # Batch select is used to drop and forget semantic maps at those timesteps in which we're not planning
        self.batch_select = MapBatchSelect()
        # Since we only have path predictions for some timesteps (the ones not dropped above), we use this to fill
        # in the missing pieces by reorienting the past trajectory prediction into the frame of the current timestep
        self.map_batch_fill_missing = MapBatchFillMissing(self.params["local_map_size"], self.params["world_size_px"])

        # Passing true to freeze will freeze these weights regardless of whether they've been explicitly reloaded or not
        enable_weight_saving(self.sentence_embedding, "sentence_embedding", alwaysfreeze=False)

        # Output an action given the global semantic map
        if self.params["map_to_action"] == "downsample2":
            self.map_to_action = EgoMapToActionTriplet(
                map_channels=self.params["map_to_act_channels"],
                map_size=self.params["local_map_size"],
                other_features_size=self.params["emb_size"])

        elif self.params["map_to_action"] == "cropped":
            self.map_to_action = CroppedMapToActionTriplet(
                map_channels=self.params["map_to_act_channels"],
                map_size=self.params["local_map_size"],
                other_features_size=self.params["emb_size"]
            )

        # Don't freeze the trajectory-to-action weights: they are pre-trained during path-prediction training
        # and fine-tuned end-to-end on all timesteps
        enable_weight_saving(self.map_to_action, "map_to_action", alwaysfreeze=False, neverfreeze=True)

        # Auxiliary Objectives
        # --------------------------------------------------------------------------------------------------------------

        # We add all auxiliaries that are necessary. The first argument is the auxiliary name, followed by parameters,
        # followed by a variable number of input names. ModuleWithAuxiliaries will automatically collect the inputs
        # that have been saved with keep_auxiliary_input() during execution
        if aux_class_features:
            self.add_auxiliary(ClassAuxiliary2D("aux_class", None,  self.params["feature_channels"], self.params["num_landmarks"], self.params["dropout"],
                                                "fpv_features", "lm_pos_fpv", "lm_indices"))
        if aux_grounding_features:
            self.add_auxiliary(ClassAuxiliary2D("aux_ground", None, self.params["relevance_channels"], 2, self.params["dropout"],
                                                "fpv_features_g", "lm_pos_fpv", "lm_mentioned"))
        if aux_class_map:
            self.add_auxiliary(ClassAuxiliary2D("aux_class_map", self.params["world_size_px"], self.params["feature_channels"], self.params["num_landmarks"], self.params["dropout"],
                                                "map_s_w_select", "lm_pos_map_select", "lm_indices_select"))
        if aux_grounding_map:
            self.add_auxiliary(ClassAuxiliary2D("aux_grounding_map", self.params["world_size_px"], self.params["relevance_channels"], 2, self.params["dropout"],
                                                "map_a_w_select", "lm_pos_map_select", "lm_mentioned_select"))
        if aux_goal_map:
            self.add_auxiliary(GoalAuxiliary2D("aux_goal_map", self.params["goal_channels"], self.params["world_size_px"],
                                               "map_b_w", "goal_pos_map"))
        # RSS model uses templated data for landmark and side prediction
        if self.use_aux_lang and self.params["templates"]:
            self.add_auxiliary(ClassAuxiliary("aux_lang_lm", self.params["emb_size"], self.params["num_landmarks"], 1,
                                                "sentence_embed", "lm_mentioned_tplt"))
            self.add_auxiliary(ClassAuxiliary("aux_lang_side", self.params["emb_size"], self.params["num_sides"], 1,
                                                "sentence_embed", "side_mentioned_tplt"))
        # CoRL model uses alignment-model groundings
        elif self.use_aux_lang:
            # One output per landmark, 2 classes per output. This is for fine-tuning, so use the embedding that will be fine-tuned
            self.add_auxiliary(ClassAuxiliary("aux_lang_lm_nl", self.params["emb_size"], 2, self.params["num_landmarks"],
                                                "sentence_embed", "lang_lm_mentioned"))
        if self.use_aux_traj_on_map:
            self.add_auxiliary(PathAuxiliary2D("aux_path", "map_b_r_select", "traj_gt_r_select"))

        if self.use_aux_reg_map:
            self.add_auxiliary(FeatureRegularizationAuxiliary2D("aux_regularize_features", None, "l1",
                                                                "map_s_w_select", "lm_pos_map_select"))

        self.goal_good_criterion = GoalPredictionGoodCriterion(ok_distance=3.2)
        self.goal_acc_meter = MovingAverageMeter(10)

        self.print_auxiliary_info()

        self.action_loss = ActionLoss()

        self.env_id = None
        self.prev_instruction = None
        self.seq_step = 0
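The comment in Example #7 describes a name-keyed wiring: each auxiliary declares which intermediate tensors it needs, and the parent module stashes those tensors by name during the forward pass. A rough sketch of that pattern, assuming hypothetical name and input_names attributes on each auxiliary (the real ModuleWithAuxiliaries has considerably more machinery):

import torch.nn as nn

class ModuleWithAuxiliariesSketch(nn.Module):
    def __init__(self):
        super().__init__()
        self.auxiliaries = nn.ModuleDict()
        self.stored_inputs = {}

    def add_auxiliary(self, aux):
        # aux.name and aux.input_names are assumed attributes for this sketch.
        self.auxiliaries[aux.name] = aux

    def keep_auxiliary_input(self, name, tensor):
        # Called from the main forward pass to expose intermediate tensors.
        self.stored_inputs[name] = tensor

    def compute_aux_losses(self):
        # Each auxiliary consumes the stored tensors it asked for, by name.
        return {name: aux(*(self.stored_inputs[k] for k in aux.input_names))
                for name, aux in self.auxiliaries.items()}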
Example #8
    def __init__(self,
                 run_name,
                 ignore_lang=False,
                 class_loss=True,
                 ground_loss=True):
        super(ModelTopDownPathGoalPredictor, self).__init__()
        self.run_name = run_name
        self.model_name = "top_down_path_pred_pretrain"
        self.writer = SummaryWriter(log_dir="runs/" + run_name)

        self.ignore_lang = ignore_lang
        self.class_loss = class_loss
        self.ground_loss = ground_loss

        # The feature net extracts the 2D feature map from the input image.
        # The label_pool down-sizes the ground-truth labels, which are input at the same size as the input image
        # The output predicted labels are the size of the feature map
        self.feature_net = ResNet13Light(32, down_pad=True)
        self.label_pool = nn.MaxPool2d(8)

        if self.ground_loss:
            self.lang_filter = MapLangSemanticFilter(sentence_embedding_size,
                                                     32, 3)
            self.aux_ground_linear = nn.Linear(3, 2)
            enable_weight_saving(self.lang_filter, "ground_filter")
            enable_weight_saving(self.aux_ground_linear, "ground_aux_linear")

        if RESNET:
            self.unet = ResNetConditional(sentence_embedding_size, 35, 2)
        else:
            unet_c_in = 35 if self.ground_loss else 32
            unet_hc1 = 48 if self.ground_loss else 48
            unet_hb1 = 24 if self.ground_loss else 24
            self.unet = Unet5ContextualBneck(unet_c_in,
                                             2,
                                             sentence_embedding_size,
                                             hc1=unet_hc1,
                                             hb1=unet_hb1,
                                             hc2=128,
                                             split_embedding=splitemb)

        if attention:
            self.sentence_embedding = SentenceEmbeddingSelfAttention(
                word_embedding_size,
                lstm_size,
                sentence_embedding_layers,
                attention_heads=attention_heads)
        else:
            self.sentence_embedding = SentenceEmbeddingSimple(
                word_embedding_size, sentence_embedding_size,
                sentence_embedding_layers)

        self.gather2d = Gather2D()

        if self.class_loss:
            self.aux_class_linear = nn.Linear(32, 64)
            enable_weight_saving(self.aux_class_linear, "class_aux_linear")

        print("Sentence Embedding #Params: ",
              get_n_params(self.sentence_embedding))
        print("U-Net #Params: ", get_n_params(self.unet))
        print("Class auxiliary: ", self.class_loss)
        print("Ground auxiliary: ", self.ground_loss)

        # Enable saving of pre-trained weights
        enable_weight_saving(self.feature_net, "feature_resnet_light")
        enable_weight_saving(self.unet, "unet")
        enable_weight_saving(self.sentence_embedding, "sentence_embedding")

        if NLL:
            #self.mask_loss = nn.BCELoss()
            self.mask_loss = nn.NLLLoss2d()
        elif BCE:
            self.mask_loss = nn.BCEWithLogitsLoss()
        elif CE:
            self.spatialsoftmax = SpatialSoftmax2d()
            self.mask_loss = CrossEntropy2d()
        else:
            self.mask_loss = nn.MSELoss()

        self.aux_loss = nn.CrossEntropyLoss(reduce=True, size_average=True)
        self.epoch_numbers = {"train": 0, "eval": 0}
        self.iter = nn.Parameter(torch.zeros(1), requires_grad=False)

        self.dropout = nn.Dropout(0.5)
        self.dropout2d = nn.Dropout2d(0.5)
        self.dropout3d = nn.Dropout3d(0.5)

        self.viz_images = []
        self.instructions = []
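Example #8 relies on APIs that newer PyTorch releases have deprecated or removed (nn.NLLLoss2d and the reduce / size_average arguments). If the snippet is reused with a current version, the rough equivalents are:

import torch.nn as nn

mask_loss = nn.NLLLoss()                          # accepts (N, C, H, W) inputs directly
aux_loss = nn.CrossEntropyLoss(reduction='mean')  # same as reduce=True, size_average=True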