Example #1
    def __init__(
        self,
        observation_space,
        hidden_size,
        goal_sensor_uuid=None,
        detach=False,
        additional_sensors=[] # low dim sensors corresponding to registered name
    ):
        super().__init__()
        self.goal_sensor_uuid = goal_sensor_uuid
        self.additional_sensors = additional_sensors
        self._n_input_goal = 0
        # if goal_sensor_uuid is not None and goal_sensor_uuid != "no_sensor":
        #     self.goal_sensor_uuid = goal_sensor_uuid
        #     self._initialize_goal_encoder(observation_space)
        self._hidden_size = hidden_size

        resnet_baseplanes = 32
        backbone="resnet18"
        visual_resnet = ResNetEncoder(
            observation_space,
            baseplanes=resnet_baseplanes,
            ngroups=resnet_baseplanes // 2,
            make_backbone=getattr(resnet, backbone),
            normalize_visual_inputs=False,
            obs_transform=ResizeCenterCropper(size=(256, 256)),
            backbone_only=True,
        )

        self.detach = detach
        self.visual_resnet = visual_resnet
        self.visual_encoder = nn.Sequential(
            Flatten(),
            nn.Linear(
                np.prod(visual_resnet.output_shape), hidden_size
            ),
            nn.Sigmoid()
        )

        self.visual_decoder = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Upsample(scale_factor=2),
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Upsample(scale_factor=2),
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Upsample(scale_factor=2),
            nn.Conv2d(64, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Upsample(scale_factor=2),
            nn.Conv2d(32, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Upsample(scale_factor=2),
            nn.Conv2d(32, 3, kernel_size=3, stride=1, padding=1),
        )

        self.train()
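A minimal standalone shape check for a decoder like the one above (a sketch; the 8x8 input is an assumed feature-map size, since the `ResNetEncoder` backbone that would produce it is not shown): each `Upsample(scale_factor=2)` doubles the spatial resolution, so five of them turn an 8x8 feature map into a 256x256, 3-channel image.

import torch
import torch.nn as nn

decoder = nn.Sequential(
    nn.Conv2d(32, 64, 3, 1, 1), nn.ReLU(), nn.Upsample(scale_factor=2),
    nn.Conv2d(64, 64, 3, 1, 1), nn.ReLU(), nn.Upsample(scale_factor=2),
    nn.Conv2d(64, 64, 3, 1, 1), nn.ReLU(), nn.Upsample(scale_factor=2),
    nn.Conv2d(64, 32, 3, 1, 1), nn.ReLU(), nn.Upsample(scale_factor=2),
    nn.Conv2d(32, 32, 3, 1, 1), nn.ReLU(), nn.Upsample(scale_factor=2),
    nn.Conv2d(32, 3, 3, 1, 1),
)
features = torch.randn(1, 32, 8, 8)  # assumed backbone output: 32 channels, 8x8
print(decoder(features).shape)       # torch.Size([1, 3, 256, 256])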
Example #2
    def __init__(
        self,
        observation_space,
        output_size=128,
        checkpoint="NONE",
        backbone="resnet50",
        resnet_baseplanes=32,
        normalize_visual_inputs=False,
        trainable=False,
        spatial_output: bool = False,
    ):
        super().__init__()

        self.visual_encoder = ResNetEncoder(
            spaces.Dict({"depth": observation_space.spaces["depth"]}),
            baseplanes=resnet_baseplanes,
            ngroups=resnet_baseplanes // 2,
            make_backbone=getattr(resnet, backbone),
            normalize_visual_inputs=normalize_visual_inputs,
            obs_transform=None,
        )

        for param in self.visual_encoder.parameters():
            param.requires_grad_(trainable)

        if checkpoint != "NONE":
            ddppo_weights = torch.load(checkpoint)

            weights_dict = {}
            for k, v in ddppo_weights["state_dict"].items():
                split_layer_name = k.split(".")[2:]
                if split_layer_name[0] != "visual_encoder":
                    continue

                layer_name = ".".join(split_layer_name[1:])
                weights_dict[layer_name] = v

            del ddppo_weights
            self.visual_encoder.load_state_dict(weights_dict, strict=True)

        self.spatial_output = spatial_output

        if not self.spatial_output:
            self.output_shape = (output_size, )
            self.visual_fc = nn.Sequential(
                Flatten(),
                nn.Linear(np.prod(self.visual_encoder.output_shape),
                          output_size),
                nn.ReLU(True),
            )
        else:
            self.spatial_embeddings = nn.Embedding(
                self.visual_encoder.output_shape[1] *
                self.visual_encoder.output_shape[2],
                64,
            )

            self.output_shape = list(self.visual_encoder.output_shape)
            self.output_shape[0] += self.spatial_embeddings.embedding_dim
            self.output_shape = tuple(self.output_shape)
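The checkpoint-loading loop above keeps only parameters that live under a `visual_encoder` submodule and strips the leading module path. A small illustration of that remapping with a made-up key (the `split(".")[2:]` implies checkpoint keys of the form `<outer>.<net>.visual_encoder.<rest>`):

k = "actor_critic.net.visual_encoder.backbone.conv1.weight"  # hypothetical key
split_layer_name = k.split(".")[2:]           # ['visual_encoder', 'backbone', 'conv1', 'weight']
assert split_layer_name[0] == "visual_encoder"
layer_name = ".".join(split_layer_name[1:])   # 'backbone.conv1.weight'
print(layer_name)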
Example #3
    def __init__(
        self,
        observation_space,
        action_space,
        hidden_size,
        net=SingleBelief,
        aux_tasks=[],
        config=None,
        **kwargs,
    ):
        super().__init__(
            observation_space,
            action_space,
            hidden_size,
            net,
            aux_tasks=aux_tasks,
            config=config,
            **kwargs
        )
        self.medium = config.midlevel_medium
        self.visual_encoder = None
        self.visual_resize = nn.Sequential(
            Flatten(),
            nn.Linear(2048, hidden_size),
            nn.ReLU(True)
        )
Example #4
    def _setup_net(self, rgb_input, feature_dim):
        self.cnn = nn.Sequential(
            nn.Conv2d(in_channels=rgb_input,
                      out_channels=32,
                      kernel_size=3,
                      stride=2),
            nn.ReLU(True),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3,
                      stride=2),
            nn.ReLU(True),
            nn.Conv2d(in_channels=64,
                      out_channels=128,
                      kernel_size=3,
                      stride=2),
            nn.ReLU(True),
            nn.Conv2d(in_channels=128,
                      out_channels=256,
                      kernel_size=3,
                      stride=2),
            nn.ReLU(True),
        )
        self.flatten = nn.Sequential(
            Flatten(),
            nn.Linear(15 * 15 * 256, feature_dim),
            # nn.ReLU(True),
        )

        self._init_weight()
        print(self.cnn)
        _print_model_parameters(self.cnn)
Example #5
    def __init__(
            self,
            observation_space,
            action_space,
            goal_sensor_uuid,
            hidden_size,
            num_recurrent_layers,
            rnn_type,
            backbone,
            resnet_baseplanes,
            normalize_visual_inputs,
            obs_transform=ResizeCenterCropper(size=(256, 256)),
    ):
        super().__init__()
        self.goal_sensor_uuid = goal_sensor_uuid

        self.prev_action_embedding = nn.Embedding(action_space.n + 1, 32)
        self._n_prev_action = 32

        self._n_input_goal = (
            observation_space.spaces[self.goal_sensor_uuid].shape[0] + 1)
        self.tgt_embeding = nn.Linear(self._n_input_goal, 32)
        self._n_input_goal = 32

        self._hidden_size = hidden_size

        rnn_input_size = self._n_input_goal + self._n_prev_action
        self.visual_encoder = ResNetEncoder(
            observation_space,
            baseplanes=resnet_baseplanes,
            ngroups=resnet_baseplanes // 2,
            make_backbone=getattr(resnet, backbone),
            normalize_visual_inputs=normalize_visual_inputs,
            obs_transform=obs_transform,
        )

        if not self.visual_encoder.is_blind:
            self.visual_fc = nn.Sequential(
                Flatten(),
                nn.Linear(np.prod(self.visual_encoder.output_shape),
                          hidden_size),
                nn.ReLU(True),
            )

        self.state_encoder = RNNStateEncoder(
            (0 if self.is_blind else self._hidden_size) + rnn_input_size,
            self._hidden_size,
            rnn_type=rnn_type,
            num_layers=num_recurrent_layers,
        )

        self.train()
Example #6
    def _init_model(self, cnn_dims, output_size):
        r"""cnn_dims: initial cnn dimensions.
        """
        if self.is_blind:
            self.cnn = nn.Sequential()
            return

        # kernel size for different CNN layers
        self._cnn_layers_kernel_size = [(8, 8), (4, 4), (3, 3)]

        # strides for different CNN layers
        self._cnn_layers_stride = [(4, 4), (2, 2), (1, 1)]

        for kernel_size, stride in zip(self._cnn_layers_kernel_size,
                                       self._cnn_layers_stride):
            cnn_dims = self._conv_output_dim(
                dimension=cnn_dims,
                padding=np.array([0, 0], dtype=np.float32),
                dilation=np.array([1, 1], dtype=np.float32),
                kernel_size=np.array(kernel_size, dtype=np.float32),
                stride=np.array(stride, dtype=np.float32),
            )

        self.cnn = nn.Sequential(
            nn.Conv2d(
                in_channels=self._n_input_rgb + self._n_input_depth,
                out_channels=32,
                kernel_size=self._cnn_layers_kernel_size[0],
                stride=self._cnn_layers_stride[0],
            ),
            nn.ReLU(True),
            nn.Conv2d(
                in_channels=32,
                out_channels=64,
                kernel_size=self._cnn_layers_kernel_size[1],
                stride=self._cnn_layers_stride[1],
            ),
            nn.ReLU(True),
            nn.Conv2d(
                in_channels=64,
                out_channels=32,
                kernel_size=self._cnn_layers_kernel_size[2],
                stride=self._cnn_layers_stride[2],
            ),
            Contiguous(),
            Flatten(),
            nn.Linear(32 * cnn_dims[0] * cnn_dims[1], output_size),
            nn.ReLU(True),
        )
        self.layer_init()
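The `_conv_output_dim` helper used above is not part of this snippet. A sketch of the standard convolution output-size arithmetic it presumably applies per spatial axis (name and signature are assumptions), followed by the numbers for a 256x256 input through the three layers defined above:

import numpy as np

def conv_output_dim(dimension, padding, dilation, kernel_size, stride):
    # out = floor((in + 2*pad - dilation*(kernel - 1) - 1) / stride + 1), per axis
    return tuple(
        int(np.floor(
            (dimension[i] + 2 * padding[i] - dilation[i] * (kernel_size[i] - 1) - 1)
            / stride[i] + 1
        ))
        for i in range(len(dimension))
    )

dims = (256, 256)
for k, s in zip([(8, 8), (4, 4), (3, 3)], [(4, 4), (2, 2), (1, 1)]):
    dims = conv_output_dim(dims, (0, 0), (1, 1), k, s)
print(dims)  # (28, 28): the final Linear then sees 32 * 28 * 28 input features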
Example #7
    def forward(self, x):
        # 21 1 240 240
        x = self.maxpool(x)  # 21 1 120 120
        x = self.conv1(x)  # 21 8 30 30
        x = nn.ReLU()(x)

        x = self.maxpool(x)  # 21 8 15 15
        x = self.conv2(x)  # 21 16 8 8
        x = nn.ReLU()(x)

        x = self.maxpool(x)  # 21 16 4 4
        x = self.conv3(x)  # 21 32 2 2
        x = nn.ReLU()(x)

        x = Flatten()(x.contiguous())  # 21 128

        x = self.fc(x)  # 21 256

        return x
Example #8
    def __init__(
        self,
        observation_space,
        hidden_size,
        goal_sensor_uuid=None,
        additional_sensors=[]  # low dim sensors corresponding to registered name
    ):
        super().__init__()
        self.goal_sensor_uuid = goal_sensor_uuid
        self.additional_sensors = additional_sensors
        self._n_input_goal = 0
        if goal_sensor_uuid is not None and goal_sensor_uuid != "no_sensor":
            self.goal_sensor_uuid = goal_sensor_uuid
            self._initialize_goal_encoder(observation_space)
        self._hidden_size = hidden_size

        resnet_baseplanes = 32
        backbone = "resnet18"
        visual_resnet = ResNetEncoder(
            observation_space,
            baseplanes=resnet_baseplanes,
            ngroups=resnet_baseplanes // 2,
            make_backbone=getattr(resnet, backbone),
            normalize_visual_inputs=False,
        )
        self.visual_encoder = nn.Sequential(
            visual_resnet,
            Flatten(),
            nn.Linear(np.prod(visual_resnet.output_shape), hidden_size),
            nn.ReLU(True),
        )

        final_embedding_size = (0 if self.is_blind else
                                self._hidden_size) + self._n_input_goal
        for sensor in additional_sensors:
            final_embedding_size += observation_space.spaces[sensor].shape[0]

        self.state_encoder = RNNStateEncoder(final_embedding_size,
                                             self._hidden_size)
        self.train()
Example #9
    def __init__(
        self,
        observation_space,
        action_space,
        hidden_size,
        net=SingleBelief,
        aux_tasks=[], # bruh are we even forwarding these things...
        config=None,
        **kwargs, # Note, we forward kwargs to the net
    ):
        assert issubclass(net, SingleBelief), "Belief policy must use belief net"
        super().__init__(net(
            observation_space=observation_space,
            hidden_size=hidden_size,
            config=config, # Forward
            **kwargs,
        ), action_space.n)
        self.aux_tasks = aux_tasks

        resnet_baseplanes = 32
        backbone="resnet18"

        visual_resnet = resnet.ResNetEncoder(
            observation_space,
            baseplanes=resnet_baseplanes,
            ngroups=resnet_baseplanes // 2,
            make_backbone=getattr(resnet, backbone),
            normalize_visual_inputs=config.use_mean_and_var
        )

        self.visual_encoder = nn.Sequential(
            visual_resnet,
            Flatten(),
            nn.Linear(
                np.prod(visual_resnet.output_shape), hidden_size
            ),
            nn.ReLU(True),
        )
Example #10
    def forward(self, x):
        # print("x: ", x.shape) # 1 23 480 480
        # x = self.maxpool(x) # 1 23 240 240
        x = self.conv1(x)
        x = nn.ReLU()(x)
        # print("x: ", x.shape) # 1 32 240 240

        x = self.maxpool(x)
        x = self.conv2(x)
        x = nn.ReLU()(x)
        # print("x: ", x.shape) # 1 64 120 120

        x = self.maxpool(x)
        x = self.conv3(x)
        x = nn.ReLU()(x)
        # print("x: ", x.shape) # 1 128 60 60

        x = self.maxpool(x)
        x = self.conv4(x)
        x = nn.ReLU()(x)
        # print("x: ", x.shape) # 1 64 30 30

        x = self.maxpool(x)
        x = self.conv5(x)
        x = nn.ReLU()(x)
        # print("x: ", x.shape) # 1 32 15 15
        # print("x: ", x.shape) # 1 32 7 7

        x = Flatten()(x.contiguous())

        # print("x: ", x.shape) # 1*7200

        x = self.fc(x)  # 1*512
        x = nn.ReLU()(x)

        return x
Example #11
    def __init__(
        self,
        observation_space,
        action_space,
        goal_sensor_uuid,
        hidden_size,
        num_recurrent_layers,
        rnn_type,
        backbone,
        resnet_baseplanes,
        normalize_visual_inputs,
        use_info_bot,
        use_odometry,
    ):
        super().__init__()
        self.goal_sensor_uuid = goal_sensor_uuid
        self._hidden_size = hidden_size

        self.prev_action_embedding = nn.Embedding(action_space.n + 1, hidden_size)
        self._n_prev_action = self.prev_action_embedding.embedding_dim

        self._n_input_goal = observation_space.spaces[self.goal_sensor_uuid].shape[0]
        self._tgt_proj = nn.Linear(self._n_input_goal, hidden_size)
        self._n_input_goal = 32

        self.ib = True
        self.use_info_bot = use_info_bot
        self.use_odometry = use_odometry

        if self.ib:
            self.bottleneck = VIBCompleteLayer(self._hidden_size, self._n_input_goal, self.use_info_bot, self.use_odometry)

        self.visual_encoder = ResNetEncoder(
            observation_space,
            baseplanes=resnet_baseplanes,
            ngroups=resnet_baseplanes // 2,
            make_backbone=getattr(resnet, backbone),
            normalize_visual_inputs=normalize_visual_inputs,
        )

        if not self.visual_encoder.is_blind:
            after_compression_flat_size = 2048
            num_compression_channels = int(
                round(
                    after_compression_flat_size
                    / (
                        self.visual_encoder.output_shape[1]
                        * self.visual_encoder.output_shape[2]
                    )
                )
            )
            self.compression = nn.Sequential(
                resnet.BasicBlock(
                    self.visual_encoder.output_shape[0],
                    self.visual_encoder.output_shape[0],
                    1,
                ),
                resnet.BasicBlock(
                    self.visual_encoder.output_shape[0],
                    num_compression_channels,
                    1,
                    downsample=nn.Conv2d(
                        self.visual_encoder.output_shape[0], num_compression_channels, 1
                    ),
                ),
            )

            self.visual_fc = nn.Sequential(
                Flatten(),
                nn.Linear(
                    np.prod(self.visual_encoder.compression_shape),
                    self._hidden_size - self._hidden_size // 4,
                    bias=False,
                ),
                nn.LayerNorm(self._hidden_size - self._hidden_size // 4),
                nn.ReLU(True),
            )

            self.visual_flow_encoder = nn.Sequential(
                Flatten(),
                nn.Linear(
                    np.prod(self.visual_encoder.compression_shape),
                    self._hidden_size // 2,
                    bias=False,
                ),
                nn.LayerNorm(self._hidden_size // 2),
                nn.ReLU(True),
                nn.Linear(self._hidden_size // 2, self._hidden_size // 4, bias=False),
                nn.LayerNorm(self._hidden_size // 4),
                nn.ReLU(True),
            )

            self.delta_egomotion_predictor = nn.Linear(self._hidden_size // 4, 3)

        if rnn_type != "transformer":
            self.state_encoder = RNNStateEncoder(
                self._hidden_size,
                self._hidden_size,
                rnn_type=rnn_type,
                num_layers=num_recurrent_layers,
            )
        else:
            self.state_encoder = TransformerStateEncoder(
                input_size=self._hidden_size, d_model=self._hidden_size
            )

        self.goal_mem_layer = nn.Sequential(
            nn.Linear(
                self._hidden_size + (self._n_input_goal if self.ib else 0),
                self.output_size,
            ),
            nn.ReLU(True),
        )

        self.pg_with_gps_pred = nn.Sequential(
            nn.Linear(self._hidden_size, self._hidden_size // 2),
            nn.ReLU(True),
            nn.Linear(self._hidden_size // 2, 3),
        )

        self.train()

        self.register_buffer("ego_error_threshold", torch.tensor([[0.01]]))
Example #12
    def __init__(self, observation_space, output_size, detector):
        super().__init__()
        self.detector = detector

        if "rgb" in observation_space.spaces:
            self._n_input_rgb = observation_space.spaces["rgb"].shape[2]
        else:
            self._n_input_rgb = 0

        if "depth" in observation_space.spaces:
            self._n_input_depth = observation_space.spaces["depth"].shape[2]
        else:
            self._n_input_depth = 0

        self._no_classes = observation_space.spaces["goalclass"].shape[0]
        self._detector_channels = 765 // (3 * 3)

        # kernel size for different CNN layers
        self._cnn_layers_kernel_size = [(8, 8), (4, 4), (3, 3)]

        # strides for different CNN layers
        self._cnn_layers_stride = [(4, 4), (2, 2), (1, 1)]

        if self._n_input_rgb > 0:
            cnn_dims = np.array(observation_space.spaces["rgb"].shape[:2],
                                dtype=np.float32)
        elif self._n_input_depth > 0:
            cnn_dims = np.array(observation_space.spaces["depth"].shape[:2],
                                dtype=np.float32)

        if self.is_blind:
            self.cnn_1 = nn.Sequential()
            self.cnn_2 = nn.Sequential()
        else:
            for kernel_size, stride in zip(self._cnn_layers_kernel_size,
                                           self._cnn_layers_stride):
                cnn_dims = self._conv_output_dim(
                    dimension=cnn_dims,
                    padding=np.array([0, 0], dtype=np.float32),
                    dilation=np.array([1, 1], dtype=np.float32),
                    kernel_size=np.array(kernel_size, dtype=np.float32),
                    stride=np.array(stride, dtype=np.float32),
                )

            self.cnn_1 = nn.Sequential(
                nn.Conv2d(
                    in_channels=self._n_input_rgb + self._n_input_depth,
                    out_channels=32,
                    kernel_size=self._cnn_layers_kernel_size[0],
                    stride=self._cnn_layers_stride[0],
                ), nn.ReLU(True),
                nn.Conv2d(
                    in_channels=32,
                    out_channels=64,
                    kernel_size=self._cnn_layers_kernel_size[1],
                    stride=self._cnn_layers_stride[1],
                ), nn.ReLU(True))

            self.detector_cnn = nn.Sequential(
                nn.Conv2d(
                    in_channels=self._detector_channels + self._no_classes,
                    out_channels=64,
                    kernel_size=1,
                    stride=1,
                ),
                nn.ReLU(True),
            )

            self.cnn_2 = nn.Sequential(
                nn.Conv2d(
                    in_channels=64 + 64,
                    out_channels=128,
                    kernel_size=self._cnn_layers_kernel_size[2],
                    stride=self._cnn_layers_stride[2],
                ),
                nn.ReLU(True),
                nn.Conv2d(
                    in_channels=128,
                    out_channels=32,
                    kernel_size=1,
                    stride=1,
                ),
                #  nn.ReLU(True),
                Flatten(),
                nn.Linear(32 * (cnn_dims[0] + 2) * (cnn_dims[1] + 2),
                          output_size),
                nn.ReLU(True),
            )
        self.layer_init()
Example #13
    def __init__(
        self,
        observation_space,
        hidden_size,
        goal_sensor_uuid=None,
        detach=False,
        imagenet=False,
        additional_sensors=[]  # low dim sensors corresponding to registered name
    ):
        self.detach = detach
        self.imagenet = imagenet
        super().__init__()
        self.goal_sensor_uuid = goal_sensor_uuid
        self.additional_sensors = additional_sensors
        self._n_input_goal = 0
        if goal_sensor_uuid is not None and goal_sensor_uuid != "no_sensor":
            self.goal_sensor_uuid = goal_sensor_uuid
            self._initialize_goal_encoder(observation_space)
        self._hidden_size = hidden_size

        resnet_baseplanes = 64
        backbone = "resnet50"
        # backbone="resnet18"

        if imagenet:
            visual_resnet = TorchVisionResNet50()
            visual_resnet.eval()
        else:
            visual_resnet = ResNetEncoder(
                observation_space,
                baseplanes=resnet_baseplanes,
                ngroups=resnet_baseplanes // 2,
                make_backbone=getattr(resnet, backbone),
                normalize_visual_inputs=False,
            )

        self.detach = detach

        self.model_encoder = ResNetEncoder(
            observation_space,
            baseplanes=resnet_baseplanes,
            ngroups=resnet_baseplanes // 2,
            make_backbone=getattr(resnet, backbone),
            normalize_visual_inputs=False,
            dense=True,
        )

        self.target_encoder = ResNetEncoder(
            observation_space,
            baseplanes=resnet_baseplanes,
            ngroups=resnet_baseplanes // 2,
            make_backbone=getattr(resnet, backbone),
            normalize_visual_inputs=False,
            dense=True,
        )

        self.visual_resnet = visual_resnet

        if imagenet:
            self.visual_encoder = nn.Sequential(
                Flatten(),
                nn.Linear(2048, hidden_size),
                nn.ReLU(True),
            )

            self.target_image_encoder = nn.Sequential(
                Flatten(),
                nn.Linear(2048, hidden_size),
                nn.ReLU(True),
            )
        else:
            self.visual_encoder = nn.Sequential(
                Flatten(),
                nn.Linear(np.prod(visual_resnet.output_shape), hidden_size),
                nn.ReLU(True),
            )

            self.target_image_encoder = nn.Sequential(
                Flatten(),
                nn.Linear(np.prod(visual_resnet.output_shape), hidden_size),
                nn.ReLU(True),
            )

        final_embedding_size = (0 if self.is_blind else
                                self._hidden_size) + self._n_input_goal
        for sensor in additional_sensors:
            final_embedding_size += observation_space.spaces[sensor].shape[0]

        if self.goal_sensor_uuid == 'imagegoal':
            final_embedding_size = 1024

        self.state_encoder = nn.Sequential(
            nn.Linear(final_embedding_size, hidden_size), nn.ReLU(True),
            nn.Linear(hidden_size, hidden_size))
        self.state_policy_encoder = RNNStateEncoder(final_embedding_size,
                                                    self._hidden_size)
        self.train()
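When `imagenet=True`, the example above routes observations through a frozen `TorchVisionResNet50` (a wrapper that is not shown here) followed by `Linear(2048, hidden_size)`. The 2048 is ResNet-50's pooled feature width; a minimal torchvision-only sketch of where that number comes from (random weights, just to keep it self-contained):

import torch
import torchvision

backbone = torchvision.models.resnet50()  # in practice, ImageNet weights would be loaded
backbone.fc = torch.nn.Identity()         # expose the pooled 2048-d features
backbone.eval()
with torch.no_grad():
    feats = backbone(torch.randn(1, 3, 224, 224))
print(feats.shape)  # torch.Size([1, 2048])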
Example #14
    def _init_perception_model(self, observation_space):
        if "rgb" in observation_space.spaces:
            self._n_input_rgb = observation_space.spaces["rgb"].shape[2]
        else:
            self._n_input_rgb = 0

        if "depth" in observation_space.spaces:
            self._n_input_depth = observation_space.spaces["depth"].shape[2]
        else:
            self._n_input_depth = 0

        # kernel size for different CNN layers
        self._cnn_layers_kernel_size = [(8, 8), (4, 4), (3, 3)]

        # strides for different CNN layers
        self._cnn_layers_stride = [(4, 4), (2, 2), (1, 1)]

        if self._n_input_rgb > 0:
            cnn_dims = np.array(observation_space.spaces["rgb"].shape[:2],
                                dtype=np.float32)
        elif self._n_input_depth > 0:
            cnn_dims = np.array(observation_space.spaces["depth"].shape[:2],
                                dtype=np.float32)

        if self.is_blind:
            return nn.Sequential()
        else:
            for kernel_size, stride in zip(self._cnn_layers_kernel_size,
                                           self._cnn_layers_stride):
                cnn_dims = self._conv_output_dim(
                    dimension=cnn_dims,
                    padding=np.array([0, 0], dtype=np.float32),
                    dilation=np.array([1, 1], dtype=np.float32),
                    kernel_size=np.array(kernel_size, dtype=np.float32),
                    stride=np.array(stride, dtype=np.float32),
                )

            return nn.Sequential(
                nn.Conv2d(
                    in_channels=self._n_input_rgb + self._n_input_depth,
                    out_channels=32,
                    kernel_size=self._cnn_layers_kernel_size[0],
                    stride=self._cnn_layers_stride[0],
                ),
                nn.ReLU(),
                nn.Conv2d(
                    in_channels=32,
                    out_channels=64,
                    kernel_size=self._cnn_layers_kernel_size[1],
                    stride=self._cnn_layers_stride[1],
                ),
                nn.ReLU(),
                nn.Conv2d(
                    in_channels=64,
                    out_channels=32,
                    kernel_size=self._cnn_layers_kernel_size[2],
                    stride=self._cnn_layers_stride[2],
                ),
                Flatten(),
                nn.Linear(32 * cnn_dims[0] * cnn_dims[1], self._hidden_size),
                nn.ReLU(),
            )
Example #15
    def __init__(self,
                 observation_space,
                 output_size,
                 drop_prob=0.5,
                 channel_scale=1):
        super().__init__()
        if "rgb" in observation_space.spaces:
            self._n_input_rgb = observation_space.spaces["rgb"].shape[2]
        else:
            self._n_input_rgb = 0

        if "depth" in observation_space.spaces:
            self._n_input_depth = observation_space.spaces["depth"].shape[2]
        else:
            self._n_input_depth = 0

        # kernel size for different CNN layers
        self._cnn_layers_kernel_size = [(8, 8), (4, 4), (3, 3)]

        # strides for different CNN layers
        self._cnn_layers_stride = [(4, 4), (2, 2), (1, 1)]

        self._drop_prob = drop_prob

        print("i am here---" * 100)
        if self._n_input_rgb > 0:
            cnn_dims = np.array(observation_space.spaces["rgb"].shape[:2],
                                dtype=np.float32)
        elif self._n_input_depth > 0:
            cnn_dims = np.array(observation_space.spaces["depth"].shape[:2],
                                dtype=np.float32)

        if self.is_blind:
            self.cnn = nn.Sequential()
        else:
            for kernel_size, stride in zip(self._cnn_layers_kernel_size,
                                           self._cnn_layers_stride):
                cnn_dims = self._conv_output_dim(
                    dimension=cnn_dims,
                    padding=np.array([0, 0], dtype=np.float32),
                    dilation=np.array([1, 1], dtype=np.float32),
                    kernel_size=np.array(kernel_size, dtype=np.float32),
                    stride=np.array(stride, dtype=np.float32),
                )

            ds = channel_scale

            self.cnn = nn.Sequential(
                nn.Conv2d(
                    in_channels=self._n_input_rgb + self._n_input_depth,
                    out_channels=32 * ds,
                    kernel_size=self._cnn_layers_kernel_size[0],
                    stride=self._cnn_layers_stride[0],
                ),
                nn.BatchNorm2d(32 * ds, affine=False),
                nn.ReLU(True),
                nn.Conv2d(
                    in_channels=32 * ds,
                    out_channels=64 * ds,
                    kernel_size=self._cnn_layers_kernel_size[1],
                    stride=self._cnn_layers_stride[1],
                ),
                nn.BatchNorm2d(64 * ds, affine=False),
                nn.ReLU(True),
                nn.Conv2d(
                    in_channels=64 * ds,
                    out_channels=32 * ds,
                    kernel_size=self._cnn_layers_kernel_size[2],
                    stride=self._cnn_layers_stride[2],
                ),
                nn.BatchNorm2d(32 * ds, affine=False),
                nn.ELU(True),
                Flatten(),
                nn.Linear(32 * ds * cnn_dims[0] * cnn_dims[1], output_size),
                nn.BatchNorm1d(output_size, affine=False),
                nn.ReLU(True),
            )

        self.layer_init()
Example #16
    def __init__(
        self,
        observation_space,
        action_space,
        hidden_size,
        num_recurrent_layers,
        rnn_type,
        backbone,
        resnet_baseplanes,
        normalize_visual_inputs,
        obs_transform=ResizeCenterCropper(size=(256, 256)),
        force_blind_policy=False,
    ):
        super().__init__()

        self.prev_action_embedding = nn.Embedding(action_space.n + 1, 32)
        self._n_prev_action = 32
        rnn_input_size = self._n_prev_action

        if (IntegratedPointGoalGPSAndCompassSensor.cls_uuid
                in observation_space.spaces):
            n_input_goal = (observation_space.spaces[
                IntegratedPointGoalGPSAndCompassSensor.cls_uuid].shape[0] + 1)
            self.tgt_embeding = nn.Linear(n_input_goal, 32)
            rnn_input_size += 32

        if ObjectGoalSensor.cls_uuid in observation_space.spaces:
            self._n_object_categories = (int(
                observation_space.spaces[ObjectGoalSensor.cls_uuid].high[0]) +
                                         1)
            self.obj_categories_embedding = nn.Embedding(
                self._n_object_categories, 32)
            rnn_input_size += 32

        if EpisodicGPSSensor.cls_uuid in observation_space.spaces:
            input_gps_dim = observation_space.spaces[
                EpisodicGPSSensor.cls_uuid].shape[0]
            self.gps_embedding = nn.Linear(input_gps_dim, 32)
            rnn_input_size += 32

        if PointGoalSensor.cls_uuid in observation_space.spaces:
            input_pointgoal_dim = observation_space.spaces[
                PointGoalSensor.cls_uuid].shape[0]
            self.pointgoal_embedding = nn.Linear(input_pointgoal_dim, 32)
            rnn_input_size += 32

        if HeadingSensor.cls_uuid in observation_space.spaces:
            input_heading_dim = (
                observation_space.spaces[HeadingSensor.cls_uuid].shape[0] + 1)
            assert input_heading_dim == 2, "Expected heading with 2D rotation."
            self.heading_embedding = nn.Linear(input_heading_dim, 32)
            rnn_input_size += 32

        if ProximitySensor.cls_uuid in observation_space.spaces:
            input_proximity_dim = observation_space.spaces[
                ProximitySensor.cls_uuid].shape[0]
            self.proximity_embedding = nn.Linear(input_proximity_dim, 32)
            rnn_input_size += 32

        if EpisodicCompassSensor.cls_uuid in observation_space.spaces:
            assert (observation_space.spaces[EpisodicCompassSensor.cls_uuid].
                    shape[0] == 1), "Expected compass with 2D rotation."
            input_compass_dim = 2  # cos and sin of the angle
            self.compass_embedding = nn.Linear(input_compass_dim, 32)
            rnn_input_size += 32

        if ImageGoalSensor.cls_uuid in observation_space.spaces:
            goal_observation_space = spaces.Dict(
                {"rgb": observation_space.spaces[ImageGoalSensor.cls_uuid]})
            self.goal_visual_encoder = ResNetEncoder(
                goal_observation_space,
                baseplanes=resnet_baseplanes,
                ngroups=resnet_baseplanes // 2,
                make_backbone=getattr(resnet, backbone),
                normalize_visual_inputs=normalize_visual_inputs,
                obs_transform=obs_transform,
            )

            self.goal_visual_fc = nn.Sequential(
                Flatten(),
                nn.Linear(np.prod(self.goal_visual_encoder.output_shape),
                          hidden_size),
                nn.ReLU(True),
            )

            rnn_input_size += hidden_size

        self._hidden_size = hidden_size

        self.visual_encoder = ResNetEncoder(
            observation_space if not force_blind_policy else spaces.Dict({}),
            baseplanes=resnet_baseplanes,
            ngroups=resnet_baseplanes // 2,
            make_backbone=getattr(resnet, backbone),
            normalize_visual_inputs=normalize_visual_inputs,
            obs_transform=obs_transform,
        )

        if not self.visual_encoder.is_blind:
            self.visual_fc = nn.Sequential(
                Flatten(),
                nn.Linear(np.prod(self.visual_encoder.output_shape),
                          hidden_size),
                nn.ReLU(True),
            )

        self.state_encoder = RNNStateEncoder(
            (0 if self.is_blind else self._hidden_size) + rnn_input_size,
            self._hidden_size,
            rnn_type=rnn_type,
            num_layers=num_recurrent_layers,
        )

        self.train()
Example #17
    def __init__(
            self,
            observation_space,
            output_size,
            obs_transform: nn.Module = ResizeCenterCropper(size=(256, 256)),
    ):
        super().__init__()

        self.obs_transform = obs_transform
        if self.obs_transform is not None:
            observation_space = obs_transform.transform_observation_space(
                observation_space)

        if "rgb" in observation_space.spaces:
            self._n_input_rgb = observation_space.spaces["rgb"].shape[2]
        else:
            self._n_input_rgb = 0

        if "depth" in observation_space.spaces:
            self._n_input_depth = observation_space.spaces["depth"].shape[2]
        else:
            self._n_input_depth = 0

        # kernel size for different CNN layers
        self._cnn_layers_kernel_size = [(8, 8), (4, 4), (3, 3)]

        # strides for different CNN layers
        self._cnn_layers_stride = [(4, 4), (2, 2), (1, 1)]

        if self._n_input_rgb > 0:
            cnn_dims = np.array(observation_space.spaces["rgb"].shape[:2],
                                dtype=np.float32)
        elif self._n_input_depth > 0:
            cnn_dims = np.array(observation_space.spaces["depth"].shape[:2],
                                dtype=np.float32)

        if self.is_blind:
            self.cnn = nn.Sequential()
        else:
            for kernel_size, stride in zip(self._cnn_layers_kernel_size,
                                           self._cnn_layers_stride):
                cnn_dims = self._conv_output_dim(
                    dimension=cnn_dims,
                    padding=np.array([0, 0], dtype=np.float32),
                    dilation=np.array([1, 1], dtype=np.float32),
                    kernel_size=np.array(kernel_size, dtype=np.float32),
                    stride=np.array(stride, dtype=np.float32),
                )

            self.cnn = nn.Sequential(
                nn.Conv2d(
                    in_channels=self._n_input_rgb + self._n_input_depth,
                    out_channels=32,
                    kernel_size=self._cnn_layers_kernel_size[0],
                    stride=self._cnn_layers_stride[0],
                ),
                nn.ReLU(True),
                nn.Conv2d(
                    in_channels=32,
                    out_channels=64,
                    kernel_size=self._cnn_layers_kernel_size[1],
                    stride=self._cnn_layers_stride[1],
                ),
                nn.ReLU(True),
                nn.Conv2d(
                    in_channels=64,
                    out_channels=32,
                    kernel_size=self._cnn_layers_kernel_size[2],
                    stride=self._cnn_layers_stride[2],
                ),
                #  nn.ReLU(True),
                Flatten(),
                nn.Linear(32 * cnn_dims[0] * cnn_dims[1], output_size),
                nn.ReLU(True),
            )

        self.layer_init()