Example #1
    def __init__(self, observation_space, hidden_size):
        super().__init__()

        if (IntegratedPointGoalGPSAndCompassSensor.cls_uuid
                in observation_space.spaces):
            self._n_input_goal = observation_space.spaces[
                IntegratedPointGoalGPSAndCompassSensor.cls_uuid].shape[0]
        elif PointGoalSensor.cls_uuid in observation_space.spaces:
            self._n_input_goal = observation_space.spaces[
                PointGoalSensor.cls_uuid].shape[0]
        elif ImageGoalSensor.cls_uuid in observation_space.spaces:
            goal_observation_space = spaces.Dict(
                {"rgb": observation_space.spaces[ImageGoalSensor.cls_uuid]})
            self.goal_visual_encoder = SimpleCNN(goal_observation_space,
                                                 hidden_size)
            self._n_input_goal = hidden_size

        self._hidden_size = hidden_size

        self.visual_encoder = SimpleCNN(observation_space, hidden_size)

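        # RNN input: visual embedding (unless blind) concatenated with the goal vector.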
        self.state_encoder = RNNStateEncoder(
            (0 if self.is_blind else self._hidden_size) + self._n_input_goal,
            self._hidden_size,
        )

        self.train()
Example #2
    def __init__(
            self,
            observation_space,
            action_space,
            goal_sensor_uuid,
            hidden_size,
            num_recurrent_layers,
            rnn_type,
            backbone,
            resnet_baseplanes,
            normalize_visual_inputs,
            obs_transform=ResizeCenterCropper(size=(256, 256)),
    ):
        super().__init__()
        self.goal_sensor_uuid = goal_sensor_uuid

        self.prev_action_embedding = nn.Embedding(action_space.n + 1, 32)
        self._n_prev_action = 32

        self._n_input_goal = (
            observation_space.spaces[self.goal_sensor_uuid].shape[0] + 1)
        self.tgt_embeding = nn.Linear(self._n_input_goal, 32)
        self._n_input_goal = 32

        self._hidden_size = hidden_size

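        # The RNN input combines the 32-dim target embedding, the 32-dim previous-action embedding, and the visual features when the agent is not blind.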
        rnn_input_size = self._n_input_goal + self._n_prev_action
        self.visual_encoder = ResNetEncoder(
            observation_space,
            baseplanes=resnet_baseplanes,
            ngroups=resnet_baseplanes // 2,
            make_backbone=getattr(resnet, backbone),
            normalize_visual_inputs=normalize_visual_inputs,
            obs_transform=obs_transform,
        )

        if not self.visual_encoder.is_blind:
            self.visual_fc = nn.Sequential(
                Flatten(),
                nn.Linear(np.prod(self.visual_encoder.output_shape),
                          hidden_size),
                nn.ReLU(True),
            )

        self.state_encoder = RNNStateEncoder(
            (0 if self.is_blind else self._hidden_size) + rnn_input_size,
            self._hidden_size,
            rnn_type=rnn_type,
            num_layers=num_recurrent_layers,
        )

        self.train()
Example #3
    def __init__(self, observation_space: Space, model_config: Config,
                 num_actions):
        super().__init__()
        self.model_config = model_config

        # Init the depth encoder
        assert model_config.DEPTH_ENCODER.cnn_type in model_config.DEPTH_ENCODER.supported_encoders, \
            f"DEPTH_ENCODER.cnn_type must be in {model_config.DEPTH_ENCODER.supported_encoders}"

        if model_config.DEPTH_ENCODER.cnn_type == "DepthEncoderResnet50":
            self.depth_encoder = DepthEncoderResnet50(
                observation_space,
                output_size=model_config.DEPTH_ENCODER.output_size,
                checkpoint=model_config.DEPTH_ENCODER.ddppo_checkpoint,
                backbone=model_config.DEPTH_ENCODER.backbone,
            )

        # Init the RGB visual encoder
        assert model_config.RGB_ENCODER.cnn_type in model_config.RGB_ENCODER.supported_encoders, \
            f"RGB_ENCODER.cnn_type must be in {model_config.RGB_ENCODER.supported_encoders}"

        if model_config.RGB_ENCODER.cnn_type == "RGBEncoderResnet50":
            device = (torch.device("cuda", model_config.TORCH_GPU_ID)
                      if torch.cuda.is_available() else torch.device("cpu"))
            self.rgb_encoder = RGBEncoderResnet50(
                observation_space, model_config.RGB_ENCODER.output_size,
                device)

        if model_config.SEQ2SEQ.use_prev_action:
            self.prev_action_embedding = nn.Embedding(num_actions + 1, 32)

        # Init the RNN state decoder
        rnn_input_size = (model_config.DEPTH_ENCODER.output_size +
                          model_config.RGB_ENCODER.output_size)

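        # Optionally widen the RNN input with pointgoal, heading, and previous-action features.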
        if model_config.SEQ2SEQ.use_pointgoal:
            rnn_input_size += (observation_space.
                               spaces["pointgoal_with_gps_compass"].shape[0])
        if model_config.SEQ2SEQ.use_heading:
            rnn_input_size += (observation_space.spaces["heading"].shape[0])

        if model_config.SEQ2SEQ.use_prev_action:
            rnn_input_size += self.prev_action_embedding.embedding_dim

        self.state_encoder = RNNStateEncoder(
            input_size=rnn_input_size,
            hidden_size=model_config.STATE_ENCODER.hidden_size,
            num_layers=1,
            rnn_type=model_config.STATE_ENCODER.rnn_type,
        )

        self.train()
Example #4
    def __init__(self, observation_space, hidden_size, goal_sensor_uuid):
        super().__init__()
        self.goal_sensor_uuid = goal_sensor_uuid
        self._n_input_goal = observation_space.spaces[self.goal_sensor_uuid].shape[0]
        self._hidden_size = hidden_size

        self.visual_encoder = SimpleCNN(observation_space, hidden_size)

        self.state_encoder = RNNStateEncoder(
            (0 if self.is_blind else self._hidden_size) + self._n_input_goal,
            self._hidden_size,
        )

        self.train()
Example #5
    def __init__(self,
                 cfg,
                 observation_space,
                 hidden_size,
                 goal_sensor_uuid,
                 with_target_encoding,
                 device,
                 visual_encoder="SimpleCNN",
                 drop_prob=0.5,
                 channel_scale=1):
        super().__init__()
        self.goal_sensor_uuid = goal_sensor_uuid
        self.with_target_encoding = with_target_encoding
        num_recurrent_layers = getattr(cfg, "num_recurrent_layers", 1)
        rnn_type = getattr(cfg, "rnn_type", "GRU")

        self._n_input_goal = observation_space.spaces[
            self.goal_sensor_uuid].shape[0]
        self._hidden_size = hidden_size

        self.visual_encoder = VISUAL_ENCODER_MODELS[visual_encoder](
            observation_space,
            hidden_size,
            drop_prob=drop_prob,
            channel_scale=channel_scale)

        visual_feat_size = 0 if self.is_blind else self._hidden_size
        rnn_out_size = self._hidden_size
        t_enc_size = self._n_input_goal if with_target_encoding else 0

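        # One auxiliary head per configured task; each receives the visual feature, target encoding, and RNN output sizes.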
        self.aux_models = aux_models = torch.nn.ModuleDict({})
        for aux_type in cfg.aux:
            aux_cfg = getattr(cfg, AUX_CLASSES[aux_type].__name__)
            aux_models[aux_type] = AUX_CLASSES[aux_type](
                aux_cfg,
                visual_feat_size,
                t_enc_size,
                rnn_out_size,
                observation_space=observation_space)

        print(rnn_type, num_recurrent_layers)
        self.state_encoder = RNNStateEncoder(visual_feat_size + t_enc_size,
                                             self._hidden_size,
                                             num_layers=num_recurrent_layers,
                                             rnn_type=rnn_type)

        self.train()
Example #6
    def __init__(
        self,
        observation_space,
        action_space,
        goal_sensor_uuid,
        hidden_size,
        num_recurrent_layers,
        rnn_type,
        backbone,
        normalize_visual_inputs,
        pretrained=False,
        finetune=False,
    ):
        super().__init__()
        self.goal_sensor_uuid = goal_sensor_uuid

        self.prev_action_embedding = nn.Embedding(action_space.n + 1, 32)
        self._n_prev_action = 32

        self._n_input_goal = (
            observation_space.spaces[self.goal_sensor_uuid].shape[0] + 1)
        self.tgt_embeding = nn.Linear(self._n_input_goal, 32)
        self._n_input_goal = 32

        self._hidden_size = hidden_size

        rnn_input_size = self._n_input_goal + self._n_prev_action
        self.visual_encoder = EfficientNetEncoder(
            observation_space,
            hidden_size=hidden_size,
            backbone_name=backbone,
            pretrained=pretrained,
            finetune=finetune,
            normalize_visual_inputs=normalize_visual_inputs,
        )

        self.state_encoder = RNNStateEncoder(
            (0 if self.is_blind else self._hidden_size) + rnn_input_size,
            self._hidden_size,
            rnn_type=rnn_type,
            num_layers=num_recurrent_layers,
        )

        self.train()
Example #7
    def __init__(self, observation_space, hidden_size, goal_sensor_uuid,
                 detector_config, device):
        super().__init__()
        self.goal_sensor_uuid = goal_sensor_uuid
        self._n_input_goal = observation_space.spaces[
            self.goal_sensor_uuid
        ].shape[0]
        self._hidden_size = hidden_size

        self.detector = detector = YoloDetector(detector_config, device)
        self.visual_encoder = AimasCNN(observation_space, hidden_size,
                                       detector)

        self.state_encoder = RNNStateEncoder(
            (0 if self.is_blind else self._hidden_size) + self._n_input_goal,
            self._hidden_size,
        )

        self.train()
Example #8
    def __init__(
        self,
        observation_space,
        hidden_size,
        goal_sensor_uuid=None,
        additional_sensors=[],  # low-dim sensors corresponding to registered names
    ):
        super().__init__()
        self.goal_sensor_uuid = goal_sensor_uuid
        self.additional_sensors = additional_sensors
        self._n_input_goal = 0
        if goal_sensor_uuid is not None and goal_sensor_uuid != "no_sensor":
            self.goal_sensor_uuid = goal_sensor_uuid
            self._initialize_goal_encoder(observation_space)
        self._hidden_size = hidden_size

        resnet_baseplanes = 32
        backbone = "resnet18"
        visual_resnet = ResNetEncoder(
            observation_space,
            baseplanes=resnet_baseplanes,
            ngroups=resnet_baseplanes // 2,
            make_backbone=getattr(resnet, backbone),
            normalize_visual_inputs=False,
        )
        self.visual_encoder = nn.Sequential(
            visual_resnet,
            Flatten(),
            nn.Linear(np.prod(visual_resnet.output_shape), hidden_size),
            nn.ReLU(True),
        )

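        # Final embedding: visual features (unless blind), the goal encoding, and any additional low-dim sensors.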
        final_embedding_size = (0 if self.is_blind else
                                self._hidden_size) + self._n_input_goal
        for sensor in additional_sensors:
            final_embedding_size += observation_space.spaces[sensor].shape[0]

        self.state_encoder = RNNStateEncoder(final_embedding_size,
                                             self._hidden_size)
        self.train()
Example #9
    def __init__(self, observation_space, hidden_size, goal_sensor_uuid,
                 with_target_encoding, device, visual_encoder="SimpleCNN",
                 drop_prob=0.5, channel_scale=1):
        super().__init__()
        self.goal_sensor_uuid = goal_sensor_uuid
        self.with_target_encoding = with_target_encoding

        self._n_input_goal = observation_space.spaces[
            self.goal_sensor_uuid
        ].shape[0]
        self._hidden_size = hidden_size

        self.visual_encoder = VISUAL_ENCODER_MODELS[visual_encoder](
            observation_space, hidden_size, drop_prob=drop_prob,
            channel_scale=channel_scale)

        self.state_encoder = RNNStateEncoder(
            (0 if self.is_blind else self._hidden_size) +
            (self._n_input_goal if with_target_encoding else 0),
            self._hidden_size,
        )

        self.train()
Example #10
    def __init__(self, observation_space: Space, num_actions: int,
                 num_sub_tasks: int, model_config: Config, batch_size: int):
        super().__init__()
        self.model_config = model_config
        self.batch_size = batch_size
        self.num_sub_tasks = num_sub_tasks
        device = (torch.device("cuda", model_config.TORCH_GPU_ID)
                  if torch.cuda.is_available() else torch.device("cpu"))

        # Init the instruction encoder

        if model_config.INSTRUCTION_ENCODER.is_bert:
            self.instruction_encoder = LanguageEncoder(
                model_config.INSTRUCTION_ENCODER, device)
        else:
            self.instruction_encoder = InstructionEncoder(
                model_config.INSTRUCTION_ENCODER)
        # Init the depth encoder
        assert model_config.DEPTH_ENCODER.cnn_type in [
            "SimpleDepthCNN",
            "VlnResnetDepthEncoder",
        ], "DEPTH_ENCODER.cnn_type must be SimpleDepthCNN or VlnResnetDepthEncoder"
        if model_config.DEPTH_ENCODER.cnn_type == "SimpleDepthCNN":
            self.depth_encoder = SimpleDepthCNN(
                observation_space, model_config.DEPTH_ENCODER.output_size)
        elif model_config.DEPTH_ENCODER.cnn_type == "VlnResnetDepthEncoder":
            self.depth_encoder = VlnResnetDepthEncoder(
                observation_space,
                output_size=model_config.DEPTH_ENCODER.output_size,
                checkpoint=model_config.DEPTH_ENCODER.ddppo_checkpoint,
                backbone=model_config.DEPTH_ENCODER.backbone,
            )

        # Init the RGB visual encoder
        assert model_config.RGB_ENCODER.cnn_type in [
            "SimpleRGBCNN",
            "TorchVisionResNet50",
        ], "RGB_ENCODER.cnn_type must be either 'SimpleRGBCNN' or 'TorchVisionResNet50'."

        if model_config.RGB_ENCODER.cnn_type == "SimpleRGBCNN":
            self.rgb_encoder = SimpleRGBCNN(
                observation_space, model_config.RGB_ENCODER.output_size)
        elif model_config.RGB_ENCODER.cnn_type == "TorchVisionResNet50":
            self.rgb_encoder = TorchVisionResNet50(
                observation_space, model_config.RGB_ENCODER.output_size,
                model_config.RGB_ENCODER.resnet_output_size, device)

        if model_config.SEQ2SEQ.use_prev_action:
            self.prev_action_embedding = nn.Embedding(num_actions + 1, 32)

        # Init the RNN state decoder
        rnn_input_size = (self.instruction_encoder.output_size +
                          model_config.DEPTH_ENCODER.output_size +
                          model_config.RGB_ENCODER.output_size)

        if model_config.SEQ2SEQ.use_prev_action:
            rnn_input_size += self.prev_action_embedding.embedding_dim

        self.state_encoder = RNNStateEncoder(
            input_size=rnn_input_size,
            hidden_size=model_config.STATE_ENCODER.hidden_size,
            num_layers=1,
            rnn_type=model_config.STATE_ENCODER.rnn_type,
        )

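        # Output heads on the RNN hidden state: progress monitor, action logits, sub-goal prediction, and stop classifier.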
        self.progress_monitor = nn.Linear(
            self.model_config.STATE_ENCODER.hidden_size, 1)
        self.linear = nn.Linear(self.model_config.STATE_ENCODER.hidden_size,
                                num_actions)
        self.sub_goal_linear = nn.Linear(
            self.model_config.STATE_ENCODER.hidden_size, self.num_sub_tasks)
        self.stop_linear = nn.Linear(
            self.model_config.STATE_ENCODER.hidden_size, 1)

        self._init_layers()
Example #11
    def __init__(
        self,
        observation_space,
        action_space,
        goal_sensor_uuid,
        hidden_size,
        num_recurrent_layers,
        rnn_type,
        backbone,
        resnet_baseplanes,
        normalize_visual_inputs,
        use_info_bot,
        use_odometry,
    ):
        super().__init__()
        self.goal_sensor_uuid = goal_sensor_uuid
        self._hidden_size = hidden_size

        self.prev_action_embedding = nn.Embedding(action_space.n + 1, hidden_size)
        self._n_prev_action = self.prev_action_embedding.embedding_dim

        self._n_input_goal = observation_space.spaces[self.goal_sensor_uuid].shape[0]
        self._tgt_proj = nn.Linear(self._n_input_goal, hidden_size)
        self._n_input_goal = 32

        self.ib = True
        self.use_info_bot = use_info_bot
        self.use_odometry = use_odometry

        if self.ib:
            self.bottleneck = VIBCompleteLayer(self._hidden_size, self._n_input_goal, self.use_info_bot, self.use_odometry)

        self.visual_encoder = ResNetEncoder(
            observation_space,
            baseplanes=resnet_baseplanes,
            ngroups=resnet_baseplanes // 2,
            make_backbone=getattr(resnet, backbone),
            normalize_visual_inputs=normalize_visual_inputs,
        )

        if not self.visual_encoder.is_blind:
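            # Pick the number of compression channels so the flattened feature map is roughly 2048 elements.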
            after_compression_flat_size = 2048
            num_compression_channels = int(
                round(
                    after_compression_flat_size
                    / (
                        self.visual_encoder.output_shape[1]
                        * self.visual_encoder.output_shape[2]
                    )
                )
            )
            self.compression = nn.Sequential(
                resnet.BasicBlock(
                    self.visual_encoder.output_shape[0],
                    self.visual_encoder.output_shape[0],
                    1,
                ),
                resnet.BasicBlock(
                    self.visual_encoder.output_shape[0],
                    num_compression_channels,
                    1,
                    downsample=nn.Conv2d(
                        self.visual_encoder.output_shape[0], num_compression_channels, 1
                    ),
                ),
            )

            self.visual_fc = nn.Sequential(
                Flatten(),
                nn.Linear(
                    np.prod(self.visual_encoder.compression_shape),
                    self._hidden_size - self._hidden_size // 4,
                    bias=False,
                ),
                nn.LayerNorm(self._hidden_size - self._hidden_size // 4),
                nn.ReLU(True),
            )

            self.visual_flow_encoder = nn.Sequential(
                Flatten(),
                nn.Linear(
                    np.prod(self.visual_encoder.compression_shape),
                    self._hidden_size // 2,
                    bias=False,
                ),
                nn.LayerNorm(self._hidden_size // 2),
                nn.ReLU(True),
                nn.Linear(self._hidden_size // 2, self._hidden_size // 4, bias=False),
                nn.LayerNorm(self._hidden_size // 4),
                nn.ReLU(True),
            )

            self.delta_egomotion_predictor = nn.Linear(self._hidden_size // 4, 3)

        if rnn_type != "transformer":
            self.state_encoder = RNNStateEncoder(
                self._hidden_size,
                self._hidden_size,
                rnn_type=rnn_type,
                num_layers=num_recurrent_layers,
            )
        else:
            self.state_encoder = TransformerStateEncoder(
                input_size=self._hidden_size, d_model=self._hidden_size
            )

        self.goal_mem_layer = nn.Sequential(
            nn.Linear(
                self._hidden_size + (self._n_input_goal if self.ib else 0),
                self.output_size,
            ),
            nn.ReLU(True),
        )

        self.pg_with_gps_pred = nn.Sequential(
            nn.Linear(self._hidden_size, self._hidden_size // 2),
            nn.ReLU(True),
            nn.Linear(self._hidden_size // 2, 3),
        )

        self.train()

        self.register_buffer("ego_error_threshold", torch.tensor([[0.01]]))
Example #12
    def __init__(self, observation_space: Space, num_actions: int,
                 model_config: Config):
        super().__init__()
        self.model_config = model_config
        model_config.defrost()
        model_config.INSTRUCTION_ENCODER.final_state_only = False
        model_config.freeze()

        # Init the instruction encoder
        self.instruction_encoder = InstructionEncoder(
            model_config.INSTRUCTION_ENCODER)

        # Init the depth encoder
        assert model_config.DEPTH_ENCODER.cnn_type in [
            "VlnResnetDepthEncoder"
        ], "DEPTH_ENCODER.cnn_type must be VlnResnetDepthEncoder"
        self.depth_encoder = VlnResnetDepthEncoder(
            observation_space,
            output_size=model_config.DEPTH_ENCODER.output_size,
            checkpoint=model_config.DEPTH_ENCODER.ddppo_checkpoint,
            backbone=model_config.DEPTH_ENCODER.backbone,
            spatial_output=True,
        )

        # Init the RGB encoder
        assert model_config.RGB_ENCODER.cnn_type in [
            "TorchVisionResNet50"
        ], "RGB_ENCODER.cnn_type must be TorchVisionResNet50'."

        device = (torch.device("cuda", model_config.TORCH_GPU_ID)
                  if torch.cuda.is_available() else torch.device("cpu"))
        self.rgb_encoder = TorchVisionResNet50(
            observation_space,
            model_config.RGB_ENCODER.output_size,
            model_config.RGB_ENCODER.resnet_output_size,
            device,
            spatial_output=True,
        )

        if model_config.CMA.use_prev_action:
            self.prev_action_embedding = nn.Embedding(num_actions + 1, 32)

        self.rcm_state_encoder = model_config.CMA.rcm_state_encoder

        hidden_size = model_config.STATE_ENCODER.hidden_size
        self._hidden_size = hidden_size

        if self.rcm_state_encoder:
            self.state_encoder = RCMStateEncoder(
                self.rgb_encoder.output_shape[0],
                self.depth_encoder.output_shape[0],
                model_config.STATE_ENCODER.hidden_size,
                self.prev_action_embedding.embedding_dim,
            )
        else:
            self.rgb_linear = nn.Sequential(
                nn.AdaptiveAvgPool1d(1),
                nn.Flatten(),
                nn.Linear(
                    self.rgb_encoder.output_shape[0],
                    model_config.RGB_ENCODER.output_size,
                ),
                nn.ReLU(True),
            )
            self.depth_linear = nn.Sequential(
                nn.Flatten(),
                nn.Linear(
                    np.prod(self.depth_encoder.output_shape),
                    model_config.DEPTH_ENCODER.output_size,
                ),
                nn.ReLU(True),
            )

            # Init the RNN state decoder
            rnn_input_size = model_config.DEPTH_ENCODER.output_size
            rnn_input_size += model_config.RGB_ENCODER.output_size
            if model_config.CMA.use_prev_action:
                rnn_input_size += self.prev_action_embedding.embedding_dim

            self.state_encoder = RNNStateEncoder(
                input_size=rnn_input_size,
                hidden_size=model_config.STATE_ENCODER.hidden_size,
                num_layers=1,
                rnn_type=model_config.STATE_ENCODER.rnn_type,
            )

        self._output_size = (model_config.STATE_ENCODER.hidden_size +
                             model_config.RGB_ENCODER.output_size +
                             model_config.DEPTH_ENCODER.output_size +
                             self.instruction_encoder.output_size)

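        # 1x1 convolutions project the RGB and depth feature maps into key/value features for the attention over visual and instruction inputs.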
        self.rgb_kv = nn.Conv1d(
            self.rgb_encoder.output_shape[0],
            hidden_size // 2 + model_config.RGB_ENCODER.output_size,
            1,
        )

        self.depth_kv = nn.Conv1d(
            self.depth_encoder.output_shape[0],
            hidden_size // 2 + model_config.DEPTH_ENCODER.output_size,
            1,
        )

        # self.depth_kv = nn.Conv1d(
        #     self.depth_encoder.output_shape[0],
        #     hidden_size,
        #     1,
        # )

        self.state_q = nn.Linear(hidden_size, hidden_size // 2)
        self.text_k = nn.Conv1d(self.instruction_encoder.output_size,
                                hidden_size // 2, 1)
        self.text_q = nn.Linear(self.instruction_encoder.output_size,
                                hidden_size // 2)

        self.register_buffer("_scale",
                             torch.tensor(1.0 / ((hidden_size // 2)**0.5)))

        if model_config.CMA.use_prev_action:
            self.second_state_compress = nn.Sequential(
                nn.Linear(
                    self._output_size +
                    self.prev_action_embedding.embedding_dim,
                    self._hidden_size,
                ),
                nn.ReLU(True),
            )
        else:
            self.second_state_compress = nn.Sequential(
                nn.Linear(
                    self._output_size,
                    self._hidden_size,
                ),
                nn.ReLU(True),
            )

        self.second_state_encoder = RNNStateEncoder(
            input_size=self._hidden_size,
            hidden_size=self._hidden_size,
            num_layers=1,
            rnn_type=model_config.STATE_ENCODER.rnn_type,
        )
        self._output_size = model_config.STATE_ENCODER.hidden_size

        self.progress_monitor = nn.Linear(self.output_size, 1)

        self.linear = nn.Linear(self.model_config.STATE_ENCODER.hidden_size,
                                num_actions)
        self.stop_linear = nn.Linear(
            self.model_config.STATE_ENCODER.hidden_size, 1)

        self._init_layers()

        self.train()
Example #13
    def _initialize_state_encoder(self):
        self.state_encoder = RNNStateEncoder(self._embedding_size, self._hidden_size)
Example #14
    def _initialize_state_encoder(self):
        self.state_encoders = nn.ModuleList([
            RNNStateEncoder(self._embedding_size, self._hidden_size)
            for _ in range(self.num_tasks)
        ])
Example #15
    def __init__(
        self,
        observation_space,
        hidden_size,
        goal_sensor_uuid=None,
        detach=False,
        imagenet=False,
        additional_sensors=[],  # low-dim sensors corresponding to registered names
    ):
        super().__init__()
        self.detach = detach
        self.imagenet = imagenet
        self.goal_sensor_uuid = goal_sensor_uuid
        self.additional_sensors = additional_sensors
        self._n_input_goal = 0
        if goal_sensor_uuid is not None and goal_sensor_uuid != "no_sensor":
            self.goal_sensor_uuid = goal_sensor_uuid
            self._initialize_goal_encoder(observation_space)
        self._hidden_size = hidden_size

        resnet_baseplanes = 64
        backbone = "resnet50"
        # backbone="resnet18"

        if imagenet:
            visual_resnet = TorchVisionResNet50()
            visual_resnet.eval()
        else:
            visual_resnet = ResNetEncoder(
                observation_space,
                baseplanes=resnet_baseplanes,
                ngroups=resnet_baseplanes // 2,
                make_backbone=getattr(resnet, backbone),
                normalize_visual_inputs=False,
            )

        self.model_encoder = ResNetEncoder(
            observation_space,
            baseplanes=resnet_baseplanes,
            ngroups=resnet_baseplanes // 2,
            make_backbone=getattr(resnet, backbone),
            normalize_visual_inputs=False,
            dense=True,
        )

        self.target_encoder = ResNetEncoder(
            observation_space,
            baseplanes=resnet_baseplanes,
            ngroups=resnet_baseplanes // 2,
            make_backbone=getattr(resnet, backbone),
            normalize_visual_inputs=False,
            dense=True,
        )

        self.visual_resnet = visual_resnet

        if imagenet:
            self.visual_encoder = nn.Sequential(
                Flatten(),
                nn.Linear(2048, hidden_size),
                nn.ReLU(True),
            )

            self.target_image_encoder = nn.Sequential(
                Flatten(),
                nn.Linear(2048, hidden_size),
                nn.ReLU(True),
            )
        else:
            self.visual_encoder = nn.Sequential(
                Flatten(),
                nn.Linear(np.prod(visual_resnet.output_shape), hidden_size),
                nn.ReLU(True),
            )

            self.target_image_encoder = nn.Sequential(
                Flatten(),
                nn.Linear(np.prod(visual_resnet.output_shape), hidden_size),
                nn.ReLU(True),
            )

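        # Final embedding combines visual features, the goal encoding, and any extra low-dim sensors; an image goal fixes it to 1024.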
        final_embedding_size = (0 if self.is_blind else
                                self._hidden_size) + self._n_input_goal
        for sensor in additional_sensors:
            final_embedding_size += observation_space.spaces[sensor].shape[0]

        if self.goal_sensor_uuid == 'imagegoal':
            final_embedding_size = 1024

        self.state_encoder = nn.Sequential(
            nn.Linear(final_embedding_size, hidden_size), nn.ReLU(True),
            nn.Linear(hidden_size, hidden_size))
        self.state_policy_encoder = RNNStateEncoder(final_embedding_size,
                                                    self._hidden_size)
        self.train()
Example #16
    def __init__(self, observation_space: Space, model_config: Config, num_actions):
        super().__init__()
        self.model_config = model_config

        # Init the instruction encoder
        self.instruction_encoder = InstructionEncoder2(model_config.INSTRUCTION_ENCODER)

        # Init the depth encoder
        assert model_config.DEPTH_ENCODER.cnn_type in [
            "SimpleDepthCNN",
            "VlnResnetDepthEncoder",
        ], "DEPTH_ENCODER.cnn_type must be SimpleDepthCNN or VlnResnetDepthEncoder"
        if model_config.DEPTH_ENCODER.cnn_type == "SimpleDepthCNN":
            self.depth_encoder = SimpleDepthCNN(
                observation_space, model_config.DEPTH_ENCODER.output_size
            )
        elif model_config.DEPTH_ENCODER.cnn_type == "VlnResnetDepthEncoder":
            self.depth_encoder = VlnResnetDepthEncoder(
                observation_space,
                output_size=model_config.DEPTH_ENCODER.output_size,
                checkpoint=model_config.DEPTH_ENCODER.ddppo_checkpoint,
                backbone=model_config.DEPTH_ENCODER.backbone,
            )

        # Init the RGB visual encoder
        assert model_config.RGB_ENCODER.cnn_type in [
            "SimpleRGBCNN",
            "TorchVisionResNet50",
        ], "RGB_ENCODER.cnn_type must be either 'SimpleRGBCNN' or 'TorchVisionResNet50'."

        if model_config.RGB_ENCODER.cnn_type == "SimpleRGBCNN":
            self.rgb_encoder = SimpleRGBCNN(
                observation_space, model_config.RGB_ENCODER.output_size
            )
        elif model_config.RGB_ENCODER.cnn_type == "TorchVisionResNet50":
            device = (
                torch.device("cuda", model_config.TORCH_GPU_ID)
                if torch.cuda.is_available()
                else torch.device("cpu")
            )
            self.rgb_encoder = TorchVisionResNet50(
                observation_space, model_config.RGB_ENCODER.output_size, device
            )

        if model_config.SEQ2SEQ.use_prev_action:
            self.prev_action_embedding = nn.Embedding(num_actions + 1, 32)

        # Init the RNN state decoder
        rnn_input_size = (
            self.instruction_encoder.output_size
            + model_config.DEPTH_ENCODER.output_size
            + model_config.RGB_ENCODER.output_size
        )

        if model_config.SEQ2SEQ.use_prev_action:
            rnn_input_size += self.prev_action_embedding.embedding_dim

        self.state_encoder = RNNStateEncoder(
            input_size=rnn_input_size,
            hidden_size=model_config.STATE_ENCODER.hidden_size,
            num_layers=1,
            rnn_type=model_config.STATE_ENCODER.rnn_type,
        )

        self.progress_monitor = nn.Linear(
            self.model_config.STATE_ENCODER.hidden_size, 1
        )

        self._init_layers()

        # bert
        # import gzip
        # with gzip.open(model_config.INSTRUCTION_ENCODER.embedding_file, "rt") as f:
        #     import json
        #     embeddings = torch.tensor(json.load(f))
        # if model_config.INSTRUCTION_ENCODER.use_pretrained_embeddings:
        #     self.embedding_layer = nn.Embedding.from_pretrained(
        #         embeddings=embeddings,
        #         freeze=not model_config.INSTRUCTION_ENCODER.fine_tune_embeddings,
        #     )
        # else:  # each embedding initialized to sampled Gaussian
        #     self.embedding_layer = nn.Embedding(
        #         num_embeddings=model_config.INSTRUCTION_ENCODER.vocab_size,
        #         embedding_dim=model_config.INSTRUCTION_ENCODER.embedding_size,
        #         padding_idx=0,
        #     )

        # configuration = BertConfig(hidden_size=model_config.INSTRUCTION_ENCODER.hidden_size,
        #                            vocab_size_or_config_json_file=model_config.INSTRUCTION_ENCODER.vocab_size,
        #                            num_attention_heads=8,
        #                            )
        # self.bert = BertModel(configuration)
        import json
        with open("index_to_word.json") as f:
            self.idx_to_word = list(json.load(f)["word"].keys())
            print(self.idx_to_word)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        self.train()
Example #17
    def __init__(
        self,
        observation_space,
        action_space,
        hidden_size,
        num_recurrent_layers,
        rnn_type,
        backbone,
        resnet_baseplanes,
        normalize_visual_inputs,
        obs_transform=ResizeCenterCropper(size=(256, 256)),
        force_blind_policy=False,
    ):
        super().__init__()

        self.prev_action_embedding = nn.Embedding(action_space.n + 1, 32)
        self._n_prev_action = 32
        rnn_input_size = self._n_prev_action

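        # Each low-dim goal / GPS / compass / heading / proximity sensor present in the observation space adds a 32-dim embedding to the RNN input; an image goal adds a full hidden_size visual embedding.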
        if (IntegratedPointGoalGPSAndCompassSensor.cls_uuid
                in observation_space.spaces):
            n_input_goal = (observation_space.spaces[
                IntegratedPointGoalGPSAndCompassSensor.cls_uuid].shape[0] + 1)
            self.tgt_embeding = nn.Linear(n_input_goal, 32)
            rnn_input_size += 32

        if ObjectGoalSensor.cls_uuid in observation_space.spaces:
            self._n_object_categories = (int(
                observation_space.spaces[ObjectGoalSensor.cls_uuid].high[0]) +
                                         1)
            self.obj_categories_embedding = nn.Embedding(
                self._n_object_categories, 32)
            rnn_input_size += 32

        if EpisodicGPSSensor.cls_uuid in observation_space.spaces:
            input_gps_dim = observation_space.spaces[
                EpisodicGPSSensor.cls_uuid].shape[0]
            self.gps_embedding = nn.Linear(input_gps_dim, 32)
            rnn_input_size += 32

        if PointGoalSensor.cls_uuid in observation_space.spaces:
            input_pointgoal_dim = observation_space.spaces[
                PointGoalSensor.cls_uuid].shape[0]
            self.pointgoal_embedding = nn.Linear(input_pointgoal_dim, 32)
            rnn_input_size += 32

        if HeadingSensor.cls_uuid in observation_space.spaces:
            input_heading_dim = (
                observation_space.spaces[HeadingSensor.cls_uuid].shape[0] + 1)
            assert input_heading_dim == 2, "Expected heading with 2D rotation."
            self.heading_embedding = nn.Linear(input_heading_dim, 32)
            rnn_input_size += 32

        if ProximitySensor.cls_uuid in observation_space.spaces:
            input_proximity_dim = observation_space.spaces[
                ProximitySensor.cls_uuid].shape[0]
            self.proximity_embedding = nn.Linear(input_proximity_dim, 32)
            rnn_input_size += 32

        if EpisodicCompassSensor.cls_uuid in observation_space.spaces:
            assert (observation_space.spaces[EpisodicCompassSensor.cls_uuid].
                    shape[0] == 1), "Expected compass with 2D rotation."
            input_compass_dim = 2  # cos and sin of the angle
            self.compass_embedding = nn.Linear(input_compass_dim, 32)
            rnn_input_size += 32

        if ImageGoalSensor.cls_uuid in observation_space.spaces:
            goal_observation_space = spaces.Dict(
                {"rgb": observation_space.spaces[ImageGoalSensor.cls_uuid]})
            self.goal_visual_encoder = ResNetEncoder(
                goal_observation_space,
                baseplanes=resnet_baseplanes,
                ngroups=resnet_baseplanes // 2,
                make_backbone=getattr(resnet, backbone),
                normalize_visual_inputs=normalize_visual_inputs,
                obs_transform=obs_transform,
            )

            self.goal_visual_fc = nn.Sequential(
                Flatten(),
                nn.Linear(np.prod(self.goal_visual_encoder.output_shape),
                          hidden_size),
                nn.ReLU(True),
            )

            rnn_input_size += hidden_size

        self._hidden_size = hidden_size

        self.visual_encoder = ResNetEncoder(
            observation_space if not force_blind_policy else spaces.Dict({}),
            baseplanes=resnet_baseplanes,
            ngroups=resnet_baseplanes // 2,
            make_backbone=getattr(resnet, backbone),
            normalize_visual_inputs=normalize_visual_inputs,
            obs_transform=obs_transform,
        )

        if not self.visual_encoder.is_blind:
            self.visual_fc = nn.Sequential(
                Flatten(),
                nn.Linear(np.prod(self.visual_encoder.output_shape),
                          hidden_size),
                nn.ReLU(True),
            )

        self.state_encoder = RNNStateEncoder(
            (0 if self.is_blind else self._hidden_size) + rnn_input_size,
            self._hidden_size,
            rnn_type=rnn_type,
            num_layers=num_recurrent_layers,
        )

        self.train()
Example #18
    def __init__(self, observation_space: Space, num_actions: int, model_config: Config, batch_size: int):
        super().__init__()
        self.model_config = model_config
        self.batch_size = batch_size
        device = (
            torch.device("cuda", model_config.TORCH_GPU_ID)
            if torch.cuda.is_available()
            else torch.device("cpu")
        )

        ## BERT Embedding
        self.embedding_layer = BertModel.from_pretrained('bert-base-uncased')
        self.ins_fc = nn.Linear(
            model_config.TRANSFORMER_INSTRUCTION_ENCODER.d_in,
            model_config.TRANSFORMER_INSTRUCTION_ENCODER.d_model)
        assert model_config.DEPTH_ENCODER.cnn_type in [
            "SimpleDepthCNN",
            "VlnResnetDepthEncoder",
        ], "DEPTH_ENCODER.cnn_type must be SimpleDepthCNN or VlnResnetDepthEncoder"
        if model_config.DEPTH_ENCODER.cnn_type == "SimpleDepthCNN":
            self.depth_encoder = SimpleDepthCNN(
                observation_space, model_config.DEPTH_ENCODER.output_size
            )
        elif model_config.DEPTH_ENCODER.cnn_type == "VlnResnetDepthEncoder":
            self.depth_encoder = VlnResnetDepthEncoder(
                observation_space,
                output_size=model_config.DEPTH_ENCODER.output_size,
                checkpoint=model_config.DEPTH_ENCODER.ddppo_checkpoint,
                backbone=model_config.DEPTH_ENCODER.backbone,
                spatial_output=True,
            )

        # Init the RGB visual encoder
        assert model_config.RGB_ENCODER.cnn_type in [
            "SimpleRGBCNN",
            "TorchVisionResNet50",
        ], "RGB_ENCODER.cnn_type must be either 'SimpleRGBCNN' or 'TorchVisionResNet50'."

        if model_config.RGB_ENCODER.cnn_type == "SimpleRGBCNN":
            self.rgb_encoder = SimpleRGBCNN(
                observation_space, model_config.RGB_ENCODER.output_size
            )
        elif model_config.RGB_ENCODER.cnn_type == "TorchVisionResNet50":
            self.rgb_encoder = TorchVisionResNet50(
                observation_space, 
                model_config.RGB_ENCODER.output_size, 
                model_config.RGB_ENCODER.resnet_output_size, 
                device,
                spatial_output=True,
            )

        self.rgb_linear = nn.Sequential(
            nn.AdaptiveAvgPool1d(1),
            nn.Flatten(),
            nn.Linear(
                self.rgb_encoder.output_shape[0],
                model_config.RGB_ENCODER.output_size,
            ),
            nn.ReLU(True),
        )

        self.depth_linear = nn.Sequential(
            nn.Flatten(),
            nn.Linear(
                np.prod(self.depth_encoder.output_shape),
                model_config.DEPTH_ENCODER.output_size,
            ),
            nn.ReLU(True),
        )

        self.rgb_kv = nn.Conv1d(
            self.rgb_encoder.output_shape[0],
            model_config.VISUAL_LING_ATTN.vis_in_features,
            1,
        )

        self.depth_kv = nn.Conv1d(
            self.depth_encoder.output_shape[0],
            model_config.VISUAL_LING_ATTN.vis_in_features,
            1,
        )
        self.image_cm_encoder = Visual_Ling_Attn(model_config.VISUAL_LING_ATTN)
        self.cross_pooler = nn.Sequential(nn.AdaptiveAvgPool1d(1),
                                          nn.Flatten())

        if model_config.SEQ2SEQ.use_prev_action:
            self.prev_action_embedding = nn.Embedding(num_actions + 1, 32)

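        # RNN input size: twice the cross-modal encoder's d_model plus the RGB and depth embedding sizes (and the previous-action embedding when enabled).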
        rnn_input_size = (
            self.model_config.IMAGE_CROSS_MODAL_ENCODER.d_model*2
            + model_config.DEPTH_ENCODER.output_size
            + model_config.RGB_ENCODER.output_size
        )

        if model_config.SEQ2SEQ.use_prev_action:
            rnn_input_size += self.prev_action_embedding.embedding_dim

        self.state_encoder = RNNStateEncoder(
            input_size=rnn_input_size,
            hidden_size=model_config.STATE_ENCODER.hidden_size,
            num_layers=1,
            rnn_type=model_config.STATE_ENCODER.rnn_type,
        )

        self.progress_monitor = nn.Linear(
            self.model_config.STATE_ENCODER.hidden_size, 1
        )
        self.linear = nn.Linear(self.model_config.STATE_ENCODER.hidden_size, num_actions)

        self._init_layers()