def __init__(self, img_in_size=256, world_size_in_img=256, feature_channels=32,
             ground_channels=3, embed_size=40, aux_ground=False, freeze=False):
    super(TopDownToEgoMap, self).__init__(img_in_size, world_size_in_img)

    # Process images using a ResNet to get a feature map
    self.feature_net = ResNet13Light(feature_channels, down_pad=True)

    self.aux_ground = aux_ground
    if aux_ground:
        self.lang_filter = MapLangSemanticFilter(embed_size, feature_channels, ground_channels)
        enable_weight_saving(self.lang_filter, "ground_filter", alwaysfreeze=freeze)

    enable_weight_saving(self.feature_net, "feature_resnet_light", alwaysfreeze=freeze)
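# --- Hedged sketch: the language-conditioned filtering idea behind MapLangSemanticFilter.
# This is an assumed, minimal design (not the repo's implementation): a sentence embedding
# predicts the weights of a 1x1 convolution that maps feature channels to grounding
# channels. LangFilterSketch and all shapes below are illustrative.
import torch
import torch.nn as nn
import torch.nn.functional as F

class LangFilterSketch(nn.Module):
    def __init__(self, embed_size=40, in_channels=32, out_channels=3):
        super(LangFilterSketch, self).__init__()
        self.in_c, self.out_c = in_channels, out_channels
        self.kernel_pred = nn.Linear(embed_size, in_channels * out_channels)

    def forward(self, feature_map, sent_embed):
        # feature_map: [1, in_c, H, W]; sent_embed: [embed_size]
        kernel = self.kernel_pred(sent_embed).view(self.out_c, self.in_c, 1, 1)
        return F.conv2d(feature_map, kernel)

# Usage: LangFilterSketch(40, 32, 3)(torch.randn(1, 32, 64, 64), torch.randn(40))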
def __init__(self, run_name=""): super(ModelTrajectoryToAction, self).__init__() self.model_name = "lsvd_action" self.run_name = run_name self.writer = LoggingSummaryWriter(log_dir="runs/" + run_name) self.params = get_current_parameters()["ModelPVN"] self.aux_weights = get_current_parameters()["AuxWeights"] self.prof = SimpleProfiler(torch_sync=PROFILE, print=PROFILE) self.iter = nn.Parameter(torch.zeros(1), requires_grad=False) # Common # -------------------------------------------------------------------------------------------------------------- self.map_transform_w_to_s = MapTransformerBase( source_map_size=self.params["global_map_size"], dest_map_size=self.params["local_map_size"], world_size=self.params["world_size_px"]) self.map_transform_r_to_w = MapTransformerBase( source_map_size=self.params["local_map_size"], dest_map_size=self.params["global_map_size"], world_size=self.params["world_size_px"]) # Output an action given the global semantic map if self.params["map_to_action"] == "downsample2": self.map_to_action = EgoMapToActionTriplet( map_channels=self.params["map_to_act_channels"], map_size=self.params["local_map_size"], other_features_size=self.params["emb_size"]) elif self.params["map_to_action"] == "cropped": self.map_to_action = CroppedMapToActionTriplet( map_channels=self.params["map_to_act_channels"], map_size=self.params["local_map_size"], manual=self.params["manual_rule"], path_only=self.params["action_in_path_only"], recurrence=self.params["action_recurrence"]) self.spatialsoftmax = SpatialSoftmax2d() self.gt_fill_missing = MapBatchFillMissing( self.params["local_map_size"], self.params["world_size_px"]) # Don't freeze the trajectory to action weights, because it will be pre-trained during path-prediction training # and finetuned on all timesteps end-to-end enable_weight_saving(self.map_to_action, "map_to_action", alwaysfreeze=False, neverfreeze=True) self.action_loss = ActionLoss() self.env_id = None self.seg_idx = None self.prev_instruction = None self.seq_step = 0 self.get_act_start_pose = None self.gt_labels = None
def __init__(self, name, channels_in=32, map_world_size=32, *inputs):
    super(GoalAuxiliary2D, self).__init__(name, *inputs)
    self.gather_2d = Gather2D()
    self.channels_in = channels_in
    self.map_world_size = map_world_size
    self.goal_linear = nn.Linear(channels_in, 2)
    enable_weight_saving(self.goal_linear, "aux_goal_linear_" + name)
    self.loss = nn.CrossEntropyLoss()
    self.accuracy_meter = MovingAverageMeter(10)
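# --- Hedged sketch: gathering per-pixel feature vectors from a [C, H, W] map at integer
# (x, y) coordinates, which is the kind of input goal_linear then classifies into
# goal / not-goal. Assumed semantics for illustration, not the repo's Gather2D.
import torch

def gather_2d(feature_map, coords):
    # feature_map: [C, H, W]; coords: [N, 2] integer (x, y) pixel coordinates
    x, y = coords[:, 0], coords[:, 1]
    return feature_map[:, y, x].t()  # [N, C]

fmap = torch.randn(32, 64, 64)
coords = torch.tensor([[3, 5], [10, 20]])
vecs = gather_2d(fmap, coords)  # [2, 32], ready for an nn.Linear(32, 2) goal head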
def __init__(self, name, feature_vec_len=32, num_classes=2, num_outputs=1, *inputs):
    super(ClassAuxiliary, self).__init__(name, *inputs)
    self.channels_in = feature_vec_len
    self.num_classes = num_classes
    self.num_outputs = num_outputs
    # One linear head producing num_outputs independent classifications over num_classes
    self.cls_linear = nn.Linear(feature_vec_len, num_classes * num_outputs)
    enable_weight_saving(self.cls_linear, "aux_class_linear_" + name)
    self.loss = nn.CrossEntropyLoss()
    self.meter_accuracy = MovingAverageMeter(10)
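# --- Hedged sketch: how a multi-output head like cls_linear above can be scored with
# nn.CrossEntropyLoss. The flat [B, num_classes * num_outputs] logits are reshaped so
# the class dimension comes second, as CrossEntropyLoss expects. Shapes are illustrative.
import torch
import torch.nn as nn

batch, feature_vec_len, num_classes, num_outputs = 4, 32, 2, 3
cls_linear = nn.Linear(feature_vec_len, num_classes * num_outputs)
features = torch.randn(batch, feature_vec_len)
logits = cls_linear(features).view(batch, num_outputs, num_classes)
labels = torch.randint(num_classes, (batch, num_outputs))
# CrossEntropyLoss accepts [B, C, K] logits with [B, K] targets
loss = nn.CrossEntropyLoss()(logits.transpose(1, 2), labels)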
def __init__(self, name, feature_vec_len=32, num_classes=64, dropout=0, *inputs):
    super(ClassAuxiliary2D, self).__init__(name, *inputs)
    self.gather_2d = Gather2D()
    self.channels_in = feature_vec_len
    self.dropout = nn.Dropout(dropout)
    self.num_classes = num_classes
    self.cls_linear = nn.Linear(feature_vec_len, num_classes)
    enable_weight_saving(self.cls_linear, "aux_class_linear_2d_" + name)
    # Pass the class weights to the constructor instead of assigning self.loss.weight
    # afterwards. Note that uniform weights ([0.5, 0.5]) cancel out under the default
    # 'mean' reduction, which normalizes by the sum of the target weights
    if name == "aux_grounding_map":
        self.loss = nn.CrossEntropyLoss(weight=torch.tensor([0.5, 0.5]))
    else:
        self.loss = nn.CrossEntropyLoss()
    self.meter_accuracy = MovingAverageMeter(10)
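# --- Hedged check of the class weights above: under the default 'mean' reduction,
# nn.CrossEntropyLoss normalizes by the sum of the target weights, so a uniform
# weight vector such as [0.5, 0.5] leaves the loss value unchanged.
import torch
import torch.nn as nn

logits = torch.tensor([[2.0, 0.0], [0.0, 2.0]])
labels = torch.tensor([0, 1])
plain = nn.CrossEntropyLoss()(logits, labels)
weighted = nn.CrossEntropyLoss(weight=torch.tensor([0.5, 0.5]))(logits, labels)
assert torch.allclose(plain, weighted)  # uniform weights are a no-op under 'mean'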
def __init__(self, name, world_size_px=32, feature_vec_len=32, num_classes=64, dropout=0, *inputs):
    super(ClassAuxiliary2D, self).__init__(name, *inputs)
    self.gather_2d = Gather2D()
    self.channels_in = feature_vec_len
    self.dropout = nn.Dropout(dropout)
    self.num_classes = num_classes
    self.world_size_px = world_size_px
    self.cls_linear = nn.Linear(feature_vec_len, num_classes)
    enable_weight_saving(self.cls_linear, "aux_class_linear_2d_" + name)
    self.loss = nn.CrossEntropyLoss()
    self.meter_accuracy = MovingAverageMeter(10)
def __init__(self, run_name="", model_class=MODEL_RSS, aux_class_features=False, aux_grounding_features=False, aux_class_map=False, aux_grounding_map=False, aux_goal_map=False, aux_lang=False, aux_traj=False, rot_noise=False, pos_noise=False): super(ModelTrajectoryTopDown, self).__init__() self.model_name = "sm_trajectory" + str(model_class) self.model_class = model_class print("Init model of type: ", str(model_class)) self.run_name = run_name self.writer = LoggingSummaryWriter(log_dir="runs/" + run_name) self.params = get_current_parameters()["Model"] self.aux_weights = get_current_parameters()["AuxWeights"] self.prof = SimpleProfiler(torch_sync=PROFILE, print=PROFILE) self.iter = nn.Parameter(torch.zeros(1), requires_grad=False) # Auxiliary Objectives self.use_aux_class_features = aux_class_features self.use_aux_grounding_features = aux_grounding_features self.use_aux_class_on_map = aux_class_map self.use_aux_grounding_on_map = aux_grounding_map self.use_aux_goal_on_map = aux_goal_map self.use_aux_lang = aux_lang self.use_aux_traj_on_map = aux_traj self.use_aux_reg_map = self.aux_weights["regularize_map"] self.use_rot_noise = rot_noise self.use_pos_noise = pos_noise # Path-pred FPV model definition # -------------------------------------------------------------------------------------------------------------- self.img_to_features_w = FPVToGlobalMap( source_map_size=self.params["global_map_size"], world_size_px=self.params["world_size_px"], world_size=self.params["world_size_m"], res_channels=self.params["resnet_channels"], map_channels=self.params["feature_channels"], img_w=self.params["img_w"], img_h=self.params["img_h"], img_dbg=IMG_DBG) self.map_accumulator_w = LeakyIntegratorGlobalMap(source_map_size=self.params["global_map_size"], world_in_map_size=self.params["world_size_px"]) # Pre-process the accumulated map to do language grounding if necessary - in the world reference frame if self.use_aux_grounding_on_map and not self.use_aux_grounding_features: self.map_processor_a_w = LangFilterMapProcessor( source_map_size=self.params["global_map_size"], world_size=self.params["world_size_px"], embed_size=self.params["emb_size"], in_channels=self.params["feature_channels"], out_channels=self.params["relevance_channels"], spatial=False, cat_out=True) else: self.map_processor_a_w = IdentityMapProcessor(source_map_size=self.params["global_map_size"], world_size=self.params["world_size_px"]) if self.use_aux_goal_on_map: self.map_processor_b_r = LangFilterMapProcessor(source_map_size=self.params["local_map_size"], world_size=self.params["world_size_px"], embed_size=self.params["emb_size"], in_channels=self.params["relevance_channels"], out_channels=self.params["goal_channels"], spatial=True, cat_out=True) else: self.map_processor_b_r = IdentityMapProcessor(source_map_size=self.params["local_map_size"], world_size=self.params["world_size_px"]) pred_channels = self.params["goal_channels"] + self.params["relevance_channels"] # Common # -------------------------------------------------------------------------------------------------------------- # Sentence Embedding self.sentence_embedding = SentenceEmbeddingSimple( self.params["word_emb_size"], self.params["emb_size"], self.params["emb_layers"]) self.map_transform_w_to_r = MapTransformerBase(source_map_size=self.params["global_map_size"], dest_map_size=self.params["local_map_size"], world_size=self.params["world_size_px"]) self.map_transform_r_to_w = MapTransformerBase(source_map_size=self.params["local_map_size"], 
dest_map_size=self.params["global_map_size"], world_size=self.params["world_size_px"]) # Batch select is used to drop and forget semantic maps at those timestaps that we're not planning in self.batch_select = MapBatchSelect() # Since we only have path predictions for some timesteps (the ones not dropped above), we use this to fill # in the missing pieces by reorienting the past trajectory prediction into the frame of the current timestep self.map_batch_fill_missing = MapBatchFillMissing(self.params["local_map_size"], self.params["world_size_px"]) # Passing true to freeze will freeze these weights regardless of whether they've been explicitly reloaded or not enable_weight_saving(self.sentence_embedding, "sentence_embedding", alwaysfreeze=False) # Output an action given the global semantic map if self.params["map_to_action"] == "downsample2": self.map_to_action = EgoMapToActionTriplet( map_channels=self.params["map_to_act_channels"], map_size=self.params["local_map_size"], other_features_size=self.params["emb_size"]) elif self.params["map_to_action"] == "cropped": self.map_to_action = CroppedMapToActionTriplet( map_channels=self.params["map_to_act_channels"], map_size=self.params["local_map_size"], other_features_size=self.params["emb_size"] ) # Don't freeze the trajectory to action weights, because it will be pre-trained during path-prediction training # and finetuned on all timesteps end-to-end enable_weight_saving(self.map_to_action, "map_to_action", alwaysfreeze=False, neverfreeze=True) # Auxiliary Objectives # -------------------------------------------------------------------------------------------------------------- # We add all auxiliaries that are necessary. The first argument is the auxiliary name, followed by parameters, # followed by variable number of names of inputs. ModuleWithAuxiliaries will automatically collect these inputs # that have been saved with keep_auxiliary_input() during execution if aux_class_features: self.add_auxiliary(ClassAuxiliary2D("aux_class", None, self.params["feature_channels"], self.params["num_landmarks"], self.params["dropout"], "fpv_features", "lm_pos_fpv", "lm_indices")) if aux_grounding_features: self.add_auxiliary(ClassAuxiliary2D("aux_ground", None, self.params["relevance_channels"], 2, self.params["dropout"], "fpv_features_g", "lm_pos_fpv", "lm_mentioned")) if aux_class_map: self.add_auxiliary(ClassAuxiliary2D("aux_class_map", self.params["world_size_px"], self.params["feature_channels"], self.params["num_landmarks"], self.params["dropout"], "map_s_w_select", "lm_pos_map_select", "lm_indices_select")) if aux_grounding_map: self.add_auxiliary(ClassAuxiliary2D("aux_grounding_map", self.params["world_size_px"], self.params["relevance_channels"], 2, self.params["dropout"], "map_a_w_select", "lm_pos_map_select", "lm_mentioned_select")) if aux_goal_map: self.add_auxiliary(GoalAuxiliary2D("aux_goal_map", self.params["goal_channels"], self.params["world_size_px"], "map_b_w", "goal_pos_map")) # RSS model uses templated data for landmark and side prediction if self.use_aux_lang and self.params["templates"]: self.add_auxiliary(ClassAuxiliary("aux_lang_lm", self.params["emb_size"], self.params["num_landmarks"], 1, "sentence_embed", "lm_mentioned_tplt")) self.add_auxiliary(ClassAuxiliary("aux_lang_side", self.params["emb_size"], self.params["num_sides"], 1, "sentence_embed", "side_mentioned_tplt")) # CoRL model uses alignment-model groundings elif self.use_aux_lang: # one output for each landmark, 2 classes per output. 
This is for finetuning, so use the embedding that's gonna be fine tuned self.add_auxiliary(ClassAuxiliary("aux_lang_lm_nl", self.params["emb_size"], 2, self.params["num_landmarks"], "sentence_embed", "lang_lm_mentioned")) if self.use_aux_traj_on_map: self.add_auxiliary(PathAuxiliary2D("aux_path", "map_b_r_select", "traj_gt_r_select")) if self.use_aux_reg_map: self.add_auxiliary(FeatureRegularizationAuxiliary2D("aux_regularize_features", None, "l1", "map_s_w_select", "lm_pos_map_select")) self.goal_good_criterion = GoalPredictionGoodCriterion(ok_distance=3.2) self.goal_acc_meter = MovingAverageMeter(10) self.print_auxiliary_info() self.action_loss = ActionLoss() self.env_id = None self.prev_instruction = None self.seq_step = 0
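# --- Hedged sketch of the auxiliary-input pattern described in the comments above.
# Assumed behavior, not the repo's ModuleWithAuxiliaries: the model stores named tensors
# during the forward pass via keep_auxiliary_input(), and each auxiliary later pulls the
# inputs it was constructed with by name. Auxiliaries are assumed to expose .name and
# .input_names; both attribute names are illustrative.
import torch.nn as nn

class ModuleWithAuxiliariesSketch(nn.Module):
    def __init__(self):
        super(ModuleWithAuxiliariesSketch, self).__init__()
        self.auxiliaries = nn.ModuleDict()
        self.input_store = {}

    def add_auxiliary(self, aux):
        self.auxiliaries[aux.name] = aux

    def keep_auxiliary_input(self, name, value):
        self.input_store[name] = value

    def auxiliary_losses(self):
        # Look up each auxiliary's declared inputs by name and score them
        return {name: aux(*[self.input_store[k] for k in aux.input_names])
                for name, aux in self.auxiliaries.items()}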
def __init__(self, run_name, ignore_lang=False, class_loss=True, ground_loss=True):
    super(ModelTopDownPathGoalPredictor, self).__init__()
    self.run_name = run_name
    self.model_name = "top_down_path_pred_pretrain"
    self.writer = SummaryWriter(log_dir="runs/" + run_name)

    self.ignore_lang = ignore_lang
    self.class_loss = class_loss
    self.ground_loss = ground_loss

    # The feature net extracts the 2D feature map from the input image.
    # The label_pool down-sizes the ground-truth labels, which are input at the same size as the input image.
    # The predicted labels are output at the size of the feature map
    self.feature_net = ResNet13Light(32, down_pad=True)
    self.label_pool = nn.MaxPool2d(8)

    if self.ground_loss:
        self.lang_filter = MapLangSemanticFilter(sentence_embedding_size, 32, 3)
        self.aux_ground_linear = nn.Linear(3, 2)
        enable_weight_saving(self.lang_filter, "ground_filter")
        enable_weight_saving(self.aux_ground_linear, "ground_aux_linear")

    if RESNET:
        self.unet = ResNetConditional(sentence_embedding_size, 35, 2)
    else:
        unet_c_in = 35 if self.ground_loss else 32
        # The hidden sizes are identical in both branches, so keep them as plain constants
        unet_hc1 = 48
        unet_hb1 = 24
        self.unet = Unet5ContextualBneck(
            unet_c_in, 2, sentence_embedding_size,
            hc1=unet_hc1, hb1=unet_hb1, hc2=128,
            split_embedding=splitemb)

    if attention:
        self.sentence_embedding = SentenceEmbeddingSelfAttention(
            word_embedding_size, lstm_size, sentence_embedding_layers,
            attention_heads=attention_heads)
    else:
        self.sentence_embedding = SentenceEmbeddingSimple(
            word_embedding_size, sentence_embedding_size, sentence_embedding_layers)

    self.gather2d = Gather2D()

    if self.class_loss:
        self.aux_class_linear = nn.Linear(32, 64)
        enable_weight_saving(self.aux_class_linear, "class_aux_linear")

    print("Sentence Embedding #Params: ", get_n_params(self.sentence_embedding))
    print("U-Net #Params: ", get_n_params(self.unet))
    print("Class auxiliary: ", self.class_loss)
    print("Ground auxiliary: ", self.ground_loss)

    # Enable saving of pre-trained weights
    enable_weight_saving(self.feature_net, "feature_resnet_light")
    enable_weight_saving(self.unet, "unet")
    enable_weight_saving(self.sentence_embedding, "sentence_embedding")

    if NLL:
        #self.mask_loss = nn.BCELoss()
        # nn.NLLLoss2d is deprecated; nn.NLLLoss handles [B, C, H, W] inputs directly
        self.mask_loss = nn.NLLLoss()
    elif BCE:
        self.mask_loss = nn.BCEWithLogitsLoss()
    elif CE:
        self.spatialsoftmax = SpatialSoftmax2d()
        self.mask_loss = CrossEntropy2d()
    else:
        self.mask_loss = nn.MSELoss()

    # reduce=True, size_average=True is the deprecated spelling of reduction='mean'
    self.aux_loss = nn.CrossEntropyLoss(reduction='mean')

    self.epoch_numbers = {"train": 0, "eval": 0}
    self.iter = nn.Parameter(torch.zeros(1), requires_grad=False)

    self.dropout = nn.Dropout(0.5)
    self.dropout2d = nn.Dropout2d(0.5)
    self.dropout3d = nn.Dropout3d(0.5)

    self.viz_images = []
    self.instructions = []
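# --- Hedged sketch of why nn.NLLLoss can replace the deprecated nn.NLLLoss2d above:
# given [B, C, H, W] log-probabilities and [B, H, W] integer targets, NLLLoss averages
# over all spatial positions. Shapes below are illustrative.
import torch
import torch.nn as nn
import torch.nn.functional as F

logits = torch.randn(2, 2, 8, 8)          # [B, C, H, W] raw mask scores
log_probs = F.log_softmax(logits, dim=1)  # normalize over the class channel
target = torch.randint(2, (2, 8, 8))      # [B, H, W] per-pixel labels
loss = nn.NLLLoss()(log_probs, target)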