Example #1
class FPVToEgoMap(MapTransformerBase):
    def __init__(self,
                 source_map_size, world_size_px,
                 world_size, img_w, img_h,
                 embed_size, map_channels, gnd_channels, res_channels=32,
                 lang_filter=False, img_dbg=False):
        super(FPVToEgoMap, self).__init__(source_map_size, world_size_px)

        self.image_debug = img_dbg
        self.use_lang_filter = lang_filter

        # Process images using a resnet to get a feature map
        if self.image_debug:
            self.img_to_features = nn.MaxPool2d(8)
        else:
            # Provide enough padding so that the map is scaled down by powers of 2.
            self.img_to_features = ImgToFeatures(res_channels, map_channels)

        if self.use_lang_filter:
            self.lang_filter = MapLangSemanticFilter(embed_size, map_channels, gnd_channels)

        # Project feature maps to the global frame
        self.map_projection = PinholeCameraProjectionModule(
            source_map_size, world_size_px, world_size, source_map_size / 2, img_w, img_h)

        self.grid_sampler = GridSampler()

        self.prof = SimpleProfiler(torch_sync=PROFILE, print=PROFILE)

        self.actual_images = None
Example #2
class FPVToFPVMap(CudaModule):
    def __init__(self,
                 img_w,
                 img_h,
                 res_channels,
                 map_channels,
                 img_dbg=False):
        super(FPVToFPVMap, self).__init__()

        self.image_debug = img_dbg

        # Provide enough padding so that the map is scaled down by powers of 2.
        self.img_to_features = ImgToFeatures(res_channels, map_channels, img_w,
                                             img_h)
        self.prof = SimpleProfiler(torch_sync=PROFILE, print=PROFILE)
        self.actual_images = None

    def cuda(self, device=None):
        CudaModule.cuda(self, device)
        self.img_to_features.cuda(device)

    def init_weights(self):
        self.img_to_features.init_weights()

    def reset(self):
        self.actual_images = None

    def forward_fpv_features(self, images, sentence_embeds, parent=None):
        """
        Compute the first-person image features given the first-person images.
        :param images: images to compute features on
        :param sentence_embeds: unused here; kept for interface compatibility with the other FPV modules
        :param parent: unused here; kept for interface compatibility with the other FPV modules
        :return: features_fpv_vis - the visual features extracted using the ResNet
        """
        # Extract image features using the ResNet feature extractor
        features_fpv_vis = self.img_to_features(images)
        return features_fpv_vis

    def forward(self, images, poses, sentence_embeds, parent=None, show=""):

        self.prof.tick("out")

        features_fpv_vis_only = self.forward_fpv_features(
            images, sentence_embeds, parent)

        return features_fpv_vis_only
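
Below is a minimal usage sketch for FPVToFPVMap. The image resolution, batch size, and channel counts are illustrative assumptions rather than values from the source repository; ImgToFeatures and the CudaModule base must be importable for it to run.

# Hypothetical usage sketch: 128x72 images and 32-channel features are assumptions.
import torch

fpv_map = FPVToFPVMap(img_w=128, img_h=72, res_channels=32, map_channels=32)
fpv_map.init_weights()

images = torch.zeros(2, 3, 72, 128)   # batch of first-person RGB images (B x C x H x W)
poses = None                          # accepted by forward() but never used in this module
sentence_embeds = None                # likewise unused; kept for interface parity

features = fpv_map(images, poses, sentence_embeds)
# features: ResNet feature maps with map_channels channels at a reduced spatial resolution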
Example #3
class FPVToFPVMap(CudaModule):
    def __init__(self,
                 img_w,
                 img_h,
                 res_channels,
                 map_channels,
                 img_dbg=False):
        super(FPVToFPVMap, self).__init__()

        self.image_debug = img_dbg

        # Provide enough padding so that the map is scaled down by powers of 2.
        self.img_to_features = ImgToFeatures(res_channels, map_channels, img_w,
                                             img_h)
        self.prof = SimpleProfiler(torch_sync=PROFILE, print=PROFILE)
        self.actual_images = None
Example #4
class FPVToGlobalMap(MapTransformerBase):
    def __init__(self,
                 source_map_size,
                 world_size_px,
                 world_size_m,
                 img_w,
                 img_h,
                 res_channels,
                 map_channels,
                 cam_h_fov,
                 domain,
                 img_dbg=False):

        super(FPVToGlobalMap, self).__init__(source_map_size, world_size_px,
                                             world_size_m)

        self.image_debug = img_dbg

        self.use_lang_filter = False

        # Process images using a resnet to get a feature map
        if self.image_debug:
            self.img_to_features = nn.MaxPool2d(8)
        else:
            # Provide enough padding so that the map is scaled down by powers of 2.
            self.img_to_features = ImgToFeatures(res_channels, map_channels,
                                                 img_w, img_h)

        # Project feature maps to the global frame
        self.map_projection = PinholeCameraProjectionModuleGlobal(
            source_map_size, world_size_px, world_size_m, img_w, img_h,
            cam_h_fov, domain)

        self.grid_sampler = GridSampler()

        self.prof = SimpleProfiler(torch_sync=PROFILE, print=PROFILE)

        self.actual_images = None
Example #5
class FPVToEgoMap(MapTransformerBase):
    def __init__(self,
                 source_map_size, world_size_px,
                 world_size, img_w, img_h,
                 embed_size, map_channels, gnd_channels, res_channels=32,
                 lang_filter=False, img_dbg=False):
        super(FPVToEgoMap, self).__init__(source_map_size, world_size_px)

        self.image_debug = img_dbg
        self.use_lang_filter = lang_filter

        # Process images using a resnet to get a feature map
        if self.image_debug:
            self.img_to_features = nn.MaxPool2d(8)
        else:
            # Provide enough padding so that the map is scaled down by powers of 2.
            self.img_to_features = ImgToFeatures(res_channels, map_channels)

        if self.use_lang_filter:
            self.lang_filter = MapLangSemanticFilter(embed_size, map_channels, gnd_channels)

        # Project feature maps to the global frame
        self.map_projection = PinholeCameraProjectionModule(
            source_map_size, world_size_px, world_size, source_map_size / 2, img_w, img_h)

        self.grid_sampler = GridSampler()

        self.prof = SimpleProfiler(torch_sync=PROFILE, print=PROFILE)

        self.actual_images = None

    def cuda(self, device=None):
        MapTransformerBase.cuda(self, device)
        self.map_projection.cuda(device)
        self.grid_sampler.cuda(device)
        self.img_to_features.cuda(device)
        if self.use_lang_filter:
            self.lang_filter.cuda(device)

    def init_weights(self):
        if not self.image_debug:
            self.img_to_features.init_weights()

    def reset(self):
        self.actual_images = None
        super(FPVToEgoMap, self).reset()

    def forward_fpv_features(self, images, sentence_embeds, parent=None):
        """
        Compute the first-person image features given the first-person images.
        If grounding loss is enabled, also returns sentence-embedding-conditioned image features.
        :param images: images to compute features on
        :param sentence_embeds: sentence embeddings for each image
        :param parent:
        :return: features_fpv_vis - the visual features extracted using the ResNet
                 features_fpv_gnd - the grounded visual features obtained after applying a 1x1 language-conditioned conv
        """
        # Extract image features. If they've been precomputed ahead of time, just grab it by the provided index
        features_fpv_vis = self.img_to_features(images)

        if parent is not None:
            parent.keep_inputs("fpv_features", features_fpv_vis)
        self.prof.tick("feat")

        # If required, pre-process image features by grounding them in language
        if self.use_lang_filter:
            self.lang_filter.precompute_conv_weights(sentence_embeds)
            features_gnd = self.lang_filter(features_fpv_vis)
            if parent is not None:
                parent.keep_inputs("fpv_features_g", features_gnd)
            self.prof.tick("gnd")
            return features_fpv_vis, features_gnd

        return features_fpv_vis, None

    def forward(self, images, poses, sentence_embeds, parent=None, show=""):

        self.prof.tick("out")

        features_fpv_vis_only, features_fpv_gnd_only = self.forward_fpv_features(images, sentence_embeds, parent)

        # If we have grounding features, the overall features are a concatenation of grounded and non-grounded features
        if features_fpv_gnd_only is not None:
            features_fpv_all = torch.cat([features_fpv_gnd_only, features_fpv_vis_only], dim=1)
        else:
            features_fpv_all = features_fpv_vis_only

        # Project first-person view features on to the map in egocentric frame
        grid_maps = self.map_projection(poses)
        self.prof.tick("proj_map")
        features_r = self.grid_sampler(features_fpv_all, grid_maps)

        # Obtain an ego-centric map mask of where we have new information
        ones_size = list(features_fpv_all.size())
        ones_size[1] = 1
        tmp_ones = empty_float_tensor(ones_size, self.is_cuda, self.cuda_device).fill_(1.0)
        new_coverages = self.grid_sampler(tmp_ones, grid_maps)

        # Make sure that new_coverage is a 0/1 mask (grid_sampler applies bilinear interpolation)
        new_coverages = new_coverages - torch.min(new_coverages)
        new_coverages = new_coverages / torch.max(new_coverages)

        self.prof.tick("gsample")

        if show != "":
            Presenter().show_image(images.data[0, 0:3], show + "_img", torch=True, scale=1, waitkey=1)
            Presenter().show_image(features_r.data[0, 0:3], show, torch=True, scale=6, waitkey=1)
            Presenter().show_image(new_coverages.data[0], show + "_covg", torch=True, scale=6, waitkey=1)

        self.prof.loop()
        self.prof.print_stats(10)

        return features_r, new_coverages
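
A call-pattern sketch for FPVToEgoMap follows. Every concrete size (map size, world size, image resolution, embedding and channel counts) is an assumed placeholder, and poses is only stubbed because the pose format expected by PinholeCameraProjectionModule is not shown in these examples, so this is not runnable as-is.

# Hypothetical call-pattern sketch; all sizes are assumptions and `poses` is a stub.
import torch

ego_mapper = FPVToEgoMap(
    source_map_size=32, world_size_px=32, world_size=30.0,
    img_w=128, img_h=72, embed_size=120,
    map_channels=32, gnd_channels=24,
    res_channels=32, lang_filter=True)
ego_mapper.init_weights()

images = torch.zeros(1, 3, 72, 128)    # first-person RGB images
sentence_embeds = torch.zeros(1, 120)  # instruction embeddings consumed by MapLangSemanticFilter
poses = None                           # stub: supply poses in the format PinholeCameraProjectionModule expects

features_ego, coverage_ego = ego_mapper(images, poses, sentence_embeds)
# features_ego: egocentric feature map (grounded and visual channels concatenated when lang_filter=True)
# coverage_ego: approximately 0/1 mask of map cells that received new information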
Example #6
class FPVToGlobalMap(MapTransformerBase):
    def __init__(self,
                 source_map_size,
                 world_size_px,
                 world_size_m,
                 img_w,
                 img_h,
                 res_channels,
                 map_channels,
                 cam_h_fov,
                 domain,
                 img_dbg=False):

        super(FPVToGlobalMap, self).__init__(source_map_size, world_size_px,
                                             world_size_m)

        self.image_debug = img_dbg

        self.use_lang_filter = False

        # Process images using a resnet to get a feature map
        if self.image_debug:
            self.img_to_features = nn.MaxPool2d(8)
        else:
            # Provide enough padding so that the map is scaled down by powers of 2.
            self.img_to_features = ImgToFeatures(res_channels, map_channels,
                                                 img_w, img_h)

        # Project feature maps to the global frame
        self.map_projection = PinholeCameraProjectionModuleGlobal(
            source_map_size, world_size_px, world_size_m, img_w, img_h,
            cam_h_fov, domain)

        self.grid_sampler = GridSampler()

        self.prof = SimpleProfiler(torch_sync=PROFILE, print=PROFILE)

        self.actual_images = None

    def cuda(self, device=None):
        MapTransformerBase.cuda(self, device)
        self.map_projection.cuda(device)
        self.grid_sampler.cuda(device)
        self.img_to_features.cuda(device)
        if self.use_lang_filter:
            self.lang_filter.cuda(device)

    def init_weights(self):
        if not self.image_debug:
            self.img_to_features.init_weights()

    def reset(self):
        self.actual_images = None
        super(FPVToGlobalMap, self).reset()

    def forward_fpv_features(self, images, sentence_embeds, tensor_store=None):
        """
        Compute the first-person image features given the first-person images
        If grounding loss is enabled, will also return sentence_embedding conditioned image features
        :param images: images to compute features on
        :param sentence_embeds: sentence embeddings for each image
        :param parent:
        :return: features_fpv_vis - the visual features extracted using the ResNet
                 features_fpv_gnd - the grounded visual features obtained after applying a 1x1 language-conditioned conv
        """
        # Extract image features. If they've been precomputed ahead of time, just grab it by the provided index
        features_fpv_vis = self.img_to_features(images)

        if tensor_store is not None:
            tensor_store.keep_inputs("fpv_features", features_fpv_vis)
        #self.prof.tick("feat")

        # If required, pre-process image features by grounding them in language
        if self.use_lang_filter:
            self.lang_filter.precompute_conv_weights(sentence_embeds)
            features_gnd = self.lang_filter(features_fpv_vis)
            if tensor_store is not None:
                tensor_store.keep_inputs("fpv_features_g", features_gnd)
            #self.prof.tick("gnd")
            return features_fpv_vis, features_gnd

        return features_fpv_vis, None

    def forward(self,
                images,
                poses,
                sentence_embeds,
                tensor_store=None,
                show="",
                halfway=False):

        self.prof.tick("out")

        # self.map_projection is implemented in numpy on CPU.
        # If we give it poses on the GPU, it will transfer them to the CPU, which causes a CUDA SYNC and waits for the
        # ResNet forward pass to complete. To make use of full GPU/CPU concurrency, we move the poses to the cpu first
        poses_cpu = poses.cpu()

        features_fpv_vis_only, features_fpv_gnd_only = self.forward_fpv_features(
            images, sentence_embeds, tensor_store)

        # Halfway HAS to be True and not only truthy
        if halfway == True:
            return None, None

        # If we have grounding features, the overall features are a concatenation of grounded and non-grounded features
        if features_fpv_gnd_only is not None:
            features_fpv_all = torch.cat(
                [features_fpv_gnd_only, features_fpv_vis_only], dim=1)
        else:
            features_fpv_all = features_fpv_vis_only

        # Project first-person view features on to the map in egocentric frame
        grid_maps_cpu = self.map_projection(poses_cpu)
        grid_maps = grid_maps_cpu.to(features_fpv_all.device)

        self.prof.tick("proj_map_and_features")
        features_r = self.grid_sampler(features_fpv_all, grid_maps)

        if DEBUG_WITH_IMG:
            img_w = self.grid_sampler(images, grid_maps)
            if tensor_store is not None:
                tensor_store.keep_inputs("images_w", img_w)
            #Presenter().show_image(images.data[0], "fpv_raw", torch=True, scale=2, waitkey=1)
            #Presenter().show_image(img_w.data[0], "fpv_projected", torch=True, scale=2, waitkey=1)

        # Obtain an ego-centric map mask of where we have new information
        ones_size = list(features_fpv_all.size())
        ones_size[1] = 1
        tmp_ones = torch.ones(ones_size).to(features_r.device)
        new_coverages = self.grid_sampler(tmp_ones, grid_maps)

        # Make sure that new_coverage is a 0/1 mask (grid_sampler applies bilinear interpolation)
        new_coverages = new_coverages - torch.min(new_coverages)
        new_coverages = new_coverages / (torch.max(new_coverages) + 1e-18)

        self.prof.tick("gsample")

        if show != "":
            Presenter().show_image(images.data[0, 0:3],
                                   show + "fpv_img",
                                   torch=True,
                                   scale=2,
                                   waitkey=1)

            # Move to CPU before converting to numpy, since grid_maps may live on the GPU
            grid_maps_np = grid_maps.data[0].cpu().numpy()

            Presenter().show_image(grid_maps_np,
                                   show + "_grid",
                                   torch=False,
                                   scale=4,
                                   waitkey=1)
            Presenter().show_image(features_fpv_all.data[0, 0:3],
                                   show + "_preproj",
                                   torch=True,
                                   scale=8,
                                   waitkey=1)
            Presenter().show_image(images.data[0, 0:3],
                                   show + "_img",
                                   torch=True,
                                   scale=1,
                                   waitkey=1)
            Presenter().show_image(features_r.data[0, 0:3],
                                   show + "_projected",
                                   torch=True,
                                   scale=6,
                                   waitkey=1)
            Presenter().show_image(new_coverages.data[0],
                                   show + "_covg",
                                   torch=True,
                                   scale=6,
                                   waitkey=1)

        self.prof.loop()
        self.prof.print_stats(10)

        return features_r, new_coverages
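
Finally, a call-pattern sketch for FPVToGlobalMap, including the halfway early exit. As above, the sizes, camera FOV, and domain string are assumptions, and poses is stubbed because its format is defined by PinholeCameraProjectionModuleGlobal, which is not shown here.

# Hypothetical call-pattern sketch; sizes, FOV, and domain are assumptions, `poses` is a stub.
import torch

global_mapper = FPVToGlobalMap(
    source_map_size=32, world_size_px=32, world_size_m=30.0,
    img_w=128, img_h=72, res_channels=32, map_channels=32,
    cam_h_fov=84.0, domain="sim")
global_mapper.init_weights()

images = torch.zeros(1, 3, 72, 128)  # first-person RGB images
sentence_embeds = None               # unused because use_lang_filter is hard-coded to False
poses = None                         # stub: supply poses in the repository's pose format

# halfway=True stops after extracting FPV features and returns (None, None)
_ = global_mapper(images, poses, sentence_embeds, halfway=True)

features_global, coverage_global = global_mapper(images, poses, sentence_embeds)
# features_global: features projected into the global map frame via the pinhole camera model
# coverage_global: approximately 0/1 mask of newly observed map cells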