def __init__(self, observation_space, output_size, audiogoal_sensor):
    super(AudioCNN, self).__init__()
    self._n_input_audio = observation_space.spaces[audiogoal_sensor].shape[2]
    self._audiogoal_sensor = audiogoal_sensor

    cnn_dims = np.array(
        observation_space.spaces[audiogoal_sensor].shape[:2],
        dtype=np.float32)

    # Small spectrograms get smaller kernels and strides so the feature map
    # does not collapse before the last conv layer.
    if cnn_dims[0] < 30 or cnn_dims[1] < 30:
        self._cnn_layers_kernel_size = [(5, 5), (3, 3), (3, 3)]
        self._cnn_layers_stride = [(2, 2), (2, 2), (1, 1)]
    else:
        self._cnn_layers_kernel_size = [(8, 8), (4, 4), (3, 3)]
        self._cnn_layers_stride = [(4, 4), (2, 2), (1, 1)]

    # Track the spatial dimensions through each conv layer so the final
    # Linear layer can be sized correctly.
    for kernel_size, stride in zip(self._cnn_layers_kernel_size,
                                   self._cnn_layers_stride):
        cnn_dims = conv_output_dim(
            dimension=cnn_dims,
            padding=np.array([0, 0], dtype=np.float32),
            dilation=np.array([1, 1], dtype=np.float32),
            kernel_size=np.array(kernel_size, dtype=np.float32),
            stride=np.array(stride, dtype=np.float32),
        )

    self.cnn = nn.Sequential(
        nn.Conv2d(
            in_channels=self._n_input_audio,
            out_channels=32,
            kernel_size=self._cnn_layers_kernel_size[0],
            stride=self._cnn_layers_stride[0],
        ),
        nn.ReLU(True),
        nn.Conv2d(
            in_channels=32,
            out_channels=64,
            kernel_size=self._cnn_layers_kernel_size[1],
            stride=self._cnn_layers_stride[1],
        ),
        nn.ReLU(True),
        nn.Conv2d(
            in_channels=64,
            out_channels=64,
            kernel_size=self._cnn_layers_kernel_size[2],
            stride=self._cnn_layers_stride[2],
        ),
        # nn.ReLU(True),
        Flatten(),
        nn.Linear(64 * cnn_dims[0] * cnn_dims[1], output_size),
        nn.ReLU(True),
    )

    layer_init(self.cnn)
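
# The helpers conv_output_dim and layer_init are referenced above but not
# shown in this section. Below is a minimal sketch consistent with the
# PyTorch Conv2d shape formula and the usual habitat-style Kaiming
# initialization; it is an assumption for reference, and the project's
# actual implementations may differ.

import numpy as np
import torch.nn as nn


def conv_output_dim(dimension, padding, dilation, kernel_size, stride):
    """Per-axis conv output size: floor((D + 2p - d*(k-1) - 1) / s) + 1."""
    assert len(dimension) == 2
    out_dimension = []
    for i in range(len(dimension)):
        out_dimension.append(
            int(
                np.floor(
                    (dimension[i] + 2 * padding[i]
                     - dilation[i] * (kernel_size[i] - 1) - 1)
                    / stride[i] + 1
                )
            )
        )
    return tuple(out_dimension)


def layer_init(cnn):
    """Kaiming-initialize conv/linear weights and zero their biases."""
    for layer in cnn:
        if isinstance(layer, (nn.Conv2d, nn.Linear)):
            nn.init.kaiming_normal_(
                layer.weight, nn.init.calculate_gain("relu"))
            if layer.bias is not None:
                nn.init.constant_(layer.bias, val=0)
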
def __init__(self, observation_space, output_size, encode_rgb, encode_depth):
    super().__init__()
    if "rgb" in observation_space.spaces and encode_rgb:
        self._n_input_rgb = observation_space.spaces["rgb"].shape[2]
    else:
        self._n_input_rgb = 0

    if "depth" in observation_space.spaces and encode_depth:
        self._n_input_depth = observation_space.spaces["depth"].shape[2]
    else:
        self._n_input_depth = 0

    # kernel size for different CNN layers
    self._cnn_layers_kernel_size = [(8, 8), (4, 4), (3, 3)]
    # strides for different CNN layers
    self._cnn_layers_stride = [(4, 4), (2, 2), (1, 1)]

    if self._n_input_rgb > 0:
        cnn_dims = np.array(
            observation_space.spaces["rgb"].shape[:2], dtype=np.float32)
    elif self._n_input_depth > 0:
        cnn_dims = np.array(
            observation_space.spaces["depth"].shape[:2], dtype=np.float32)

    if self.is_blind:
        self.cnn = nn.Sequential()
    else:
        for kernel_size, stride in zip(self._cnn_layers_kernel_size,
                                       self._cnn_layers_stride):
            cnn_dims = conv_output_dim(
                dimension=cnn_dims,
                padding=np.array([0, 0], dtype=np.float32),
                dilation=np.array([1, 1], dtype=np.float32),
                kernel_size=np.array(kernel_size, dtype=np.float32),
                stride=np.array(stride, dtype=np.float32),
            )

        self.cnn = nn.Sequential(
            nn.Conv2d(
                in_channels=self._n_input_rgb + self._n_input_depth,
                out_channels=32,
                kernel_size=self._cnn_layers_kernel_size[0],
                stride=self._cnn_layers_stride[0],
            ),
            nn.ReLU(True),
            nn.Conv2d(
                in_channels=32,
                out_channels=64,
                kernel_size=self._cnn_layers_kernel_size[1],
                stride=self._cnn_layers_stride[1],
            ),
            nn.ReLU(True),
            nn.Conv2d(
                in_channels=64,
                out_channels=32,
                kernel_size=self._cnn_layers_kernel_size[2],
                stride=self._cnn_layers_stride[2],
            ),
            # nn.ReLU(True),
            Flatten(),
            nn.Linear(32 * cnn_dims[0] * cnn_dims[1], output_size),
            nn.ReLU(True),
        )

    layer_init(self.cnn)
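
# is_blind is read above but defined elsewhere in the class. A minimal
# sketch assuming the usual habitat-style definition (True when neither
# RGB nor depth inputs are enabled, so the conv stack is skipped):

@property
def is_blind(self):
    # No visual input channels at all means the encoder has nothing to do.
    return self._n_input_rgb + self._n_input_depth == 0
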
def __init__(self, observation_space, action_space, hidden_size,
             num_recurrent_layers, rnn_type, backbone, resnet_baseplanes,
             normalize_visual_inputs,
             obs_transform=ResizeCenterCropper(size=(256, 256)),
             force_blind_policy=False, use_category_input=False,
             has_distractor_sound=False):
    super().__init__()
    self._use_category_input = use_category_input
    self._hidden_size = hidden_size
    self._is_continuous = False

    # Embed the previous action: discrete actions via an embedding table
    # (with one extra index for "no previous action"), continuous actions
    # via a linear layer over the action vector plus a validity flag.
    if action_space.__class__.__name__ == "ActionSpace":
        self.prev_action_embedding = nn.Embedding(action_space.n + 1, 32)
    else:
        self.prev_action_embedding = nn.Linear(action_space.shape[0] + 1, 32)
        self._is_continuous = True
    self._n_prev_action = 32

    rnn_input_size = self._n_prev_action

    if backbone == 'custom_resnet18':
        # self.visual_encoder = SMTCNN(observation_space)
        self.visual_encoder = VisualCNN(observation_space, hidden_size)
    else:
        self.visual_encoder = ResNetEncoder(
            observation_space if not force_blind_policy else spaces.Dict({}),
            baseplanes=resnet_baseplanes,
            ngroups=resnet_baseplanes // 2,
            make_backbone=getattr(resnet, backbone),
            normalize_visual_inputs=normalize_visual_inputs,
            obs_transform=obs_transform,
        )

    if PoseSensor.cls_uuid in observation_space.spaces:
        self.pose_encoder = nn.Linear(5, 16)
        pose_feature_dims = 16
        rnn_input_size += pose_feature_dims

    if SpectrogramSensor.cls_uuid in observation_space.spaces:
        self.audio_encoder = AudioCNN(
            observation_space, 128, SpectrogramSensor.cls_uuid,
            has_distractor_sound=has_distractor_sound)
        rnn_input_size += 128
    else:
        logging.info("Input has no audio")

    if use_category_input:
        rnn_input_size += 21

    if not self.visual_encoder.is_blind:
        self.visual_fc = nn.Sequential(
            Flatten(),
            nn.Linear(np.prod(self.visual_encoder.output_shape), hidden_size),
            nn.ReLU(True),
        )

    self.state_encoder = RNNStateEncoder(
        (0 if self.is_blind else self._hidden_size) + rnn_input_size,
        self._hidden_size,
        rnn_type=rnn_type,
        num_layers=num_recurrent_layers,
    )

    self.train()
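
# Worked example of the RNN input-size bookkeeping above, assuming a typical
# configuration (hidden_size=512; pose and spectrogram sensors present;
# use_category_input and force_blind_policy both False):
#
#   prev-action embedding    32
#   pose encoder            +16
#   audio encoder          +128
#   visual features        +512   (hidden_size, since the policy is not blind)
#   -------------------------
#   RNNStateEncoder input   688
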
def __init__(self, observation_space, output_size, audiogoal_sensor,
             has_distractor_sound=False):
    super().__init__()
    self._n_input_audio = observation_space.spaces[audiogoal_sensor].shape[2]
    self._audiogoal_sensor = audiogoal_sensor

    cnn_dims = np.array(
        observation_space.spaces[audiogoal_sensor].shape[:2],
        dtype=np.float32)

    self._has_distractor_sound = has_distractor_sound
    if has_distractor_sound:
        # With a distractor sound present, a 21-way category label is
        # appended to the spectrogram as extra input channels.
        self._n_input_category = 21
        print('Concatenate category label with spectrogram!')
    else:
        self._n_input_category = 0

    if cnn_dims[0] < 30 or cnn_dims[1] < 30:
        self._cnn_layers_kernel_size = [(5, 5), (3, 3), (3, 3)]
        self._cnn_layers_stride = [(2, 2), (2, 2), (1, 1)]
    else:
        self._cnn_layers_kernel_size = [(8, 8), (4, 4), (3, 3)]
        self._cnn_layers_stride = [(4, 4), (2, 2), (1, 1)]

    for kernel_size, stride in zip(self._cnn_layers_kernel_size,
                                   self._cnn_layers_stride):
        cnn_dims = self._conv_output_dim(
            dimension=cnn_dims,
            padding=np.array([0, 0], dtype=np.float32),
            dilation=np.array([1, 1], dtype=np.float32),
            kernel_size=np.array(kernel_size, dtype=np.float32),
            stride=np.array(stride, dtype=np.float32),
        )

    self.cnn = nn.Sequential(
        nn.Conv2d(
            in_channels=self._n_input_audio + self._n_input_category,
            out_channels=32,
            kernel_size=self._cnn_layers_kernel_size[0],
            stride=self._cnn_layers_stride[0],
        ),
        nn.ReLU(True),
        nn.Conv2d(
            in_channels=32,
            out_channels=64,
            kernel_size=self._cnn_layers_kernel_size[1],
            stride=self._cnn_layers_stride[1],
        ),
        nn.ReLU(True),
        nn.Conv2d(
            in_channels=64,
            out_channels=64,
            kernel_size=self._cnn_layers_kernel_size[2],
            stride=self._cnn_layers_stride[2],
        ),
        # nn.ReLU(True),
        # nn.Conv2d(
        #     in_channels=64,
        #     out_channels=32,
        #     kernel_size=self._cnn_layers_kernel_size[3],
        #     stride=self._cnn_layers_stride[3],
        # ),
        # nn.ReLU(True),
        Flatten(),
        nn.Linear(64 * cnn_dims[0] * cnn_dims[1], output_size),
        nn.ReLU(True),
    )

    self.layer_init()
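
# The in_channels above imply that the forward pass broadcasts the 21-dim
# category label over the spectrogram's spatial grid and concatenates it
# channel-wise. A hypothetical sketch of that step (the helper name and
# layout are assumptions; the project's actual forward may differ):

import torch

def _concat_category(spectrogram, category):
    # spectrogram: [B, C, H, W]; category: [B, 21] one-hot labels.
    b, _, h, w = spectrogram.shape
    # Tile each label across every spatial position as extra channels.
    category_map = category.view(b, -1, 1, 1).expand(-1, -1, h, w)
    return torch.cat([spectrogram, category_map], dim=1)
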
def __init__(self, observation_space, output_size, map_type='gm'):
    super().__init__()
    self._map_type = map_type
    self._n_input_gm = observation_space.spaces[map_type].shape[2]

    cnn_dims = np.array(
        observation_space.spaces[map_type].shape[:2], dtype=np.float32
    )

    # With zero padding and unit dilation, an input of spatial size N reduces
    # to ceil((N - f + 1) / s) per axis (equivalently floor((N - f) / s) + 1),
    # where f is the kernel size and s the stride; the channel count becomes
    # the number of filters.
    # kernel sizes and strides for the CNN layers, by map type and resolution
    if self._map_type == 'gm':
        if cnn_dims[0] == 200:
            self._cnn_layers_kernel_size = [(8, 8), (4, 4), (3, 3)]
            self._cnn_layers_stride = [(4, 4), (2, 2), (1, 1)]
        else:
            assert cnn_dims[0] == 400
            self._cnn_layers_kernel_size = [(8, 8), (4, 4), (3, 3)]
            self._cnn_layers_stride = [(5, 5), (4, 4), (2, 2)]
    elif self._map_type == 'am':
        self._cnn_layers_kernel_size = [(5, 5), (3, 3), (3, 3)]
        self._cnn_layers_stride = [(2, 2), (1, 1), (1, 1)]

    for kernel_size, stride in zip(
        self._cnn_layers_kernel_size, self._cnn_layers_stride
    ):
        cnn_dims = conv_output_dim(
            dimension=cnn_dims,
            padding=np.array([0, 0], dtype=np.float32),
            dilation=np.array([1, 1], dtype=np.float32),
            kernel_size=np.array(kernel_size, dtype=np.float32),
            stride=np.array(stride, dtype=np.float32),
        )

    self.cnn = nn.Sequential(
        nn.Conv2d(
            in_channels=self._n_input_gm,
            out_channels=32,
            kernel_size=self._cnn_layers_kernel_size[0],
            stride=self._cnn_layers_stride[0],
        ),
        nn.ReLU(True),
        nn.Conv2d(
            in_channels=32,
            out_channels=64,
            kernel_size=self._cnn_layers_kernel_size[1],
            stride=self._cnn_layers_stride[1],
        ),
        nn.ReLU(True),
        nn.Conv2d(
            in_channels=64,
            out_channels=32,
            kernel_size=self._cnn_layers_kernel_size[2],
            stride=self._cnn_layers_stride[2],
        ),
        # nn.ReLU(True),
        Flatten(),
        nn.Linear(32 * cnn_dims[0] * cnn_dims[1], output_size),
        nn.ReLU(True),
    )

    layer_init(self.cnn)
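
# Worked example of the dimension bookkeeping above for a 200x200 'gm' map
# (zero padding, unit dilation, so each axis shrinks to floor((N - f)/s) + 1):
#
#   200 --(k=8, s=4)--> 49 --(k=4, s=2)--> 23 --(k=3, s=1)--> 21
#
# so the Flatten() output feeding the Linear layer is 32 * 21 * 21 = 14112
# features. For a 400x400 map:
#
#   400 --(k=8, s=5)--> 79 --(k=4, s=4)--> 19 --(k=3, s=2)--> 9
#
# giving 32 * 9 * 9 = 2592 features.
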