Example #1
    def __init__(self, observation_space, output_size, audiogoal_sensor):
        super(AudioCNN, self).__init__()
        # Spectrogram channels sit in the last dimension of the HWC observation.
        self._n_input_audio = observation_space.spaces[audiogoal_sensor].shape[2]
        self._audiogoal_sensor = audiogoal_sensor

        cnn_dims = np.array(
            observation_space.spaces[audiogoal_sensor].shape[:2],
            dtype=np.float32)

        # Use smaller kernels and strides for small spectrogram inputs.
        if cnn_dims[0] < 30 or cnn_dims[1] < 30:
            self._cnn_layers_kernel_size = [(5, 5), (3, 3), (3, 3)]
            self._cnn_layers_stride = [(2, 2), (2, 2), (1, 1)]
        else:
            self._cnn_layers_kernel_size = [(8, 8), (4, 4), (3, 3)]
            self._cnn_layers_stride = [(4, 4), (2, 2), (1, 1)]

        # Track the spatial size through each conv layer so the final linear
        # layer gets the correct input dimension.
        for kernel_size, stride in zip(self._cnn_layers_kernel_size,
                                       self._cnn_layers_stride):
            cnn_dims = conv_output_dim(
                dimension=cnn_dims,
                padding=np.array([0, 0], dtype=np.float32),
                dilation=np.array([1, 1], dtype=np.float32),
                kernel_size=np.array(kernel_size, dtype=np.float32),
                stride=np.array(stride, dtype=np.float32),
            )

        self.cnn = nn.Sequential(
            nn.Conv2d(
                in_channels=self._n_input_audio,
                out_channels=32,
                kernel_size=self._cnn_layers_kernel_size[0],
                stride=self._cnn_layers_stride[0],
            ),
            nn.ReLU(True),
            nn.Conv2d(
                in_channels=32,
                out_channels=64,
                kernel_size=self._cnn_layers_kernel_size[1],
                stride=self._cnn_layers_stride[1],
            ),
            nn.ReLU(True),
            nn.Conv2d(
                in_channels=64,
                out_channels=64,
                kernel_size=self._cnn_layers_kernel_size[2],
                stride=self._cnn_layers_stride[2],
            ),
            #  nn.ReLU(True),
            Flatten(),
            nn.Linear(64 * cnn_dims[0] * cnn_dims[1], output_size),
            nn.ReLU(True),
        )

        layer_init(self.cnn)
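
These examples assume import numpy as np and import torch.nn as nn, plus two helpers that are not shown here: conv_output_dim and Flatten. A minimal sketch of both, assuming the standard nn.Conv2d output-size formula and a simple view-based flatten (not necessarily the authors' exact implementations):

import numpy as np
import torch.nn as nn


def conv_output_dim(dimension, padding, dilation, kernel_size, stride):
    """Spatial output size of an nn.Conv2d layer:
    floor((N + 2*padding - dilation*(kernel - 1) - 1) / stride + 1)."""
    assert len(dimension) == 2
    return tuple(
        int(np.floor(
            (dimension[i] + 2 * padding[i]
             - dilation[i] * (kernel_size[i] - 1) - 1) / stride[i] + 1
        ))
        for i in range(len(dimension))
    )


class Flatten(nn.Module):
    """Flatten every dimension except the batch dimension."""
    def forward(self, x):
        return x.reshape(x.size(0), -1)

Because conv_output_dim returns ints, the later 64 * cnn_dims[0] * cnn_dims[1] is a valid in_features for nn.Linear.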
Example #2
    def __init__(self, observation_space, output_size, encode_rgb,
                 encode_depth):
        super().__init__()
        if "rgb" in observation_space.spaces and encode_rgb:
            self._n_input_rgb = observation_space.spaces["rgb"].shape[2]
        else:
            self._n_input_rgb = 0

        if "depth" in observation_space.spaces and encode_depth:
            self._n_input_depth = observation_space.spaces["depth"].shape[2]
        else:
            self._n_input_depth = 0

        # kernel size for different CNN layers
        self._cnn_layers_kernel_size = [(8, 8), (4, 4), (3, 3)]

        # strides for different CNN layers
        self._cnn_layers_stride = [(4, 4), (2, 2), (1, 1)]

        if self._n_input_rgb > 0:
            cnn_dims = np.array(observation_space.spaces["rgb"].shape[:2],
                                dtype=np.float32)
        elif self._n_input_depth > 0:
            cnn_dims = np.array(observation_space.spaces["depth"].shape[:2],
                                dtype=np.float32)

        if self.is_blind:
            # Neither RGB nor depth is enabled: use an empty (identity) module.
            self.cnn = nn.Sequential()
        else:
            for kernel_size, stride in zip(self._cnn_layers_kernel_size,
                                           self._cnn_layers_stride):
                cnn_dims = conv_output_dim(
                    dimension=cnn_dims,
                    padding=np.array([0, 0], dtype=np.float32),
                    dilation=np.array([1, 1], dtype=np.float32),
                    kernel_size=np.array(kernel_size, dtype=np.float32),
                    stride=np.array(stride, dtype=np.float32),
                )

            # RGB and depth observations are concatenated along the channel axis.
            self.cnn = nn.Sequential(
                nn.Conv2d(
                    in_channels=self._n_input_rgb + self._n_input_depth,
                    out_channels=32,
                    kernel_size=self._cnn_layers_kernel_size[0],
                    stride=self._cnn_layers_stride[0],
                ),
                nn.ReLU(True),
                nn.Conv2d(
                    in_channels=32,
                    out_channels=64,
                    kernel_size=self._cnn_layers_kernel_size[1],
                    stride=self._cnn_layers_stride[1],
                ),
                nn.ReLU(True),
                nn.Conv2d(
                    in_channels=64,
                    out_channels=32,
                    kernel_size=self._cnn_layers_kernel_size[2],
                    stride=self._cnn_layers_stride[2],
                ),
                #  nn.ReLU(True),
                Flatten(),
                nn.Linear(32 * cnn_dims[0] * cnn_dims[1], output_size),
                nn.ReLU(True),
            )

        layer_init(self.cnn)
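
As a worked check of the dimension arithmetic, take a hypothetical 128x128 RGB input: layer 1 (kernel 8, stride 4) gives floor((128 - 8) / 4) + 1 = 31, layer 2 (kernel 4, stride 2) gives floor((31 - 4) / 2) + 1 = 14, and layer 3 (kernel 3, stride 1) gives 12, so the final linear layer takes 32 * 12 * 12 = 4608 input features.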
Example #3
    def __init__(self,
                 observation_space,
                 action_space,
                 hidden_size,
                 num_recurrent_layers,
                 rnn_type,
                 backbone,
                 resnet_baseplanes,
                 normalize_visual_inputs,
                 obs_transform=ResizeCenterCropper(size=(256, 256)),
                 force_blind_policy=False,
                 use_category_input=False,
                 has_distractor_sound=False):
        super().__init__()
        self._use_category_input = use_category_input
        self._hidden_size = hidden_size

        self._is_continuous = False
        if action_space.__class__.__name__ == "ActionSpace":
            # Discrete actions: embed the previous action index
            # (+1 leaves room for a null "start" action).
            self.prev_action_embedding = nn.Embedding(action_space.n + 1, 32)
        else:
            # Continuous actions: project the previous action vector linearly.
            self.prev_action_embedding = nn.Linear(action_space.shape[0] + 1,
                                                   32)
            self._is_continuous = True
        self._n_prev_action = 32
        rnn_input_size = self._n_prev_action

        if backbone == 'custom_resnet18':
            # self.visual_encoder = SMTCNN(observation_space)
            self.visual_encoder = VisualCNN(observation_space, hidden_size)
        else:
            self.visual_encoder = ResNetEncoder(
                observation_space
                if not force_blind_policy else spaces.Dict({}),
                baseplanes=resnet_baseplanes,
                ngroups=resnet_baseplanes // 2,
                make_backbone=getattr(resnet, backbone),
                normalize_visual_inputs=normalize_visual_inputs,
                obs_transform=obs_transform,
            )
        if PoseSensor.cls_uuid in observation_space.spaces:
            self.pose_encoder = nn.Linear(5, 16)
            pose_feature_dims = 16
            rnn_input_size += pose_feature_dims

        if SpectrogramSensor.cls_uuid in observation_space.spaces:
            self.audio_encoder = AudioCNN(
                observation_space,
                128,
                SpectrogramSensor.cls_uuid,
                has_distractor_sound=has_distractor_sound)
            rnn_input_size += 128
        else:
            logging.info("Input has no audio")

        if use_category_input:
            # Reserve room for a 21-dim object-category feature.
            rnn_input_size += 21

        if not self.visual_encoder.is_blind:
            self.visual_fc = nn.Sequential(
                Flatten(),
                nn.Linear(np.prod(self.visual_encoder.output_shape),
                          hidden_size),
                nn.ReLU(True),
            )

        self.state_encoder = RNNStateEncoder(
            (0 if self.is_blind else self._hidden_size) + rnn_input_size,
            self._hidden_size,
            rnn_type=rnn_type,
            num_layers=num_recurrent_layers,
        )

        self.train()
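Note how rnn_input_size is assembled: 32 for the previous-action embedding, plus 16 when a pose sensor is present, 128 when a spectrogram sensor is present, and 21 when category input is used; the visual feature of size hidden_size is prepended unless the policy is blind. For example, with hidden_size=512 and pose and audio available, the state encoder receives 512 + 32 + 16 + 128 = 688 inputs per step.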
Example #4
    def __init__(self,
                 observation_space,
                 output_size,
                 audiogoal_sensor,
                 has_distractor_sound=False):
        super().__init__()
        self._n_input_audio = observation_space.spaces[audiogoal_sensor].shape[2]
        self._audiogoal_sensor = audiogoal_sensor

        cnn_dims = np.array(
            observation_space.spaces[audiogoal_sensor].shape[:2],
            dtype=np.float32)

        self._has_distractor_sound = has_distractor_sound
        if has_distractor_sound:
            self._n_input_category = 21
            print('Concatenate category label with spectrogram!')
        else:
            self._n_input_category = 0

        if cnn_dims[0] < 30 or cnn_dims[1] < 30:
            self._cnn_layers_kernel_size = [(5, 5), (3, 3), (3, 3)]
            self._cnn_layers_stride = [(2, 2), (2, 2), (1, 1)]
        else:
            self._cnn_layers_kernel_size = [(8, 8), (4, 4), (3, 3)]
            self._cnn_layers_stride = [(4, 4), (2, 2), (1, 1)]

        for kernel_size, stride in zip(self._cnn_layers_kernel_size,
                                       self._cnn_layers_stride):
            cnn_dims = self._conv_output_dim(
                dimension=cnn_dims,
                padding=np.array([0, 0], dtype=np.float32),
                dilation=np.array([1, 1], dtype=np.float32),
                kernel_size=np.array(kernel_size, dtype=np.float32),
                stride=np.array(stride, dtype=np.float32),
            )

        self.cnn = nn.Sequential(
            nn.Conv2d(
                in_channels=self._n_input_audio + self._n_input_category,
                out_channels=32,
                kernel_size=self._cnn_layers_kernel_size[0],
                stride=self._cnn_layers_stride[0],
            ),
            nn.ReLU(True),
            nn.Conv2d(
                in_channels=32,
                out_channels=64,
                kernel_size=self._cnn_layers_kernel_size[1],
                stride=self._cnn_layers_stride[1],
            ),
            nn.ReLU(True),
            nn.Conv2d(
                in_channels=64,
                out_channels=64,
                kernel_size=self._cnn_layers_kernel_size[2],
                stride=self._cnn_layers_stride[2],
            ),
            # nn.ReLU(True),
            # nn.Conv2d(
            #     in_channels=64,
            #     out_channels=32,
            #     kernel_size=self._cnn_layers_kernel_size[3],
            #     stride=self._cnn_layers_stride[3],
            # ),
            #  nn.ReLU(True),
            Flatten(),
            nn.Linear(64 * cnn_dims[0] * cnn_dims[1], output_size),
            nn.ReLU(True),
        )

        self.layer_init()
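
The forward pass is not shown, but given the extra 21 input channels and the print statement above, the category label is presumably broadcast over the spectrogram's spatial grid and concatenated along the channel axis before the first convolution. A hypothetical sketch of that step (the function and tensor names are assumptions, not the authors' API):

import torch

def concat_category(spectrogram, category):
    # spectrogram: [N, C, H, W]; category: [N, 21] one-hot (assumed shapes).
    n, _, h, w = spectrogram.shape
    # Tile the category vector over the spatial grid, then stack on channels.
    category_map = category.view(n, -1, 1, 1).expand(-1, -1, h, w)
    return torch.cat([spectrogram, category_map], dim=1)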
Example #5
    def __init__(self, observation_space, output_size, map_type='gm'):
        super().__init__()
        self._map_type = map_type
        self._n_input_gm = observation_space.spaces[map_type].shape[2]

        cnn_dims = np.array(
            observation_space.spaces[map_type].shape[:2], dtype=np.float32
        )
        # An NxN input reduces to (ceil((N-f+1)/s), ceil((N-f+1)/s), num_filters),
        # where f is the filter size and s is the stride length.
        # kernel size for different CNN layers
        if self._map_type == 'gm':
            if cnn_dims[0] == 200:
                self._cnn_layers_kernel_size = [(8, 8), (4, 4), (3, 3)]

                # strides for different CNN layers
                self._cnn_layers_stride = [(4, 4), (2, 2), (1, 1)]
            else:
                assert cnn_dims[0] == 400
                self._cnn_layers_kernel_size = [(8, 8), (4, 4), (3, 3)]

                # strides for different CNN layers
                self._cnn_layers_stride = [(5, 5), (4, 4), (2, 2)]
        elif self._map_type == 'am':
            self._cnn_layers_kernel_size = [(5, 5), (3, 3), (3, 3)]

            # strides for different CNN layers
            self._cnn_layers_stride = [(2, 2), (1, 1), (1, 1)]

        for kernel_size, stride in zip(
            self._cnn_layers_kernel_size, self._cnn_layers_stride
        ):
            cnn_dims = conv_output_dim(
                dimension=cnn_dims,
                padding=np.array([0, 0], dtype=np.float32),
                dilation=np.array([1, 1], dtype=np.float32),
                kernel_size=np.array(kernel_size, dtype=np.float32),
                stride=np.array(stride, dtype=np.float32),
            )

        self.cnn = nn.Sequential(
            nn.Conv2d(
                in_channels=self._n_input_gm,
                out_channels=32,
                kernel_size=self._cnn_layers_kernel_size[0],
                stride=self._cnn_layers_stride[0],
            ),
            nn.ReLU(True),
            nn.Conv2d(
                in_channels=32,
                out_channels=64,
                kernel_size=self._cnn_layers_kernel_size[1],
                stride=self._cnn_layers_stride[1],
            ),
            nn.ReLU(True),
            nn.Conv2d(
                in_channels=64,
                out_channels=32,
                kernel_size=self._cnn_layers_kernel_size[2],
                stride=self._cnn_layers_stride[2],
            ),
            #  nn.ReLU(True),
            Flatten(),
            nn.Linear(32 * cnn_dims[0] * cnn_dims[1], output_size),
            nn.ReLU(True),
        )

        layer_init(self.cnn)
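
A hypothetical usage sketch for this map encoder, assuming the enclosing class is called MapCNN and the observation space stores the geometric map as an HWC Box (the forward pass, not shown, would permute it to NCHW before self.cnn):

import numpy as np
from gym import spaces

# A 200x200 geometric map with 8 channels, stored height x width x channels.
obs_space = spaces.Dict({
    "gm": spaces.Box(low=0.0, high=1.0, shape=(200, 200, 8), dtype=np.float32),
})
encoder = MapCNN(obs_space, output_size=256, map_type='gm')  # class name assumed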