def __init__(self, z_dim, initialize_weights=True):
    super().__init__()
    self.z_dim = z_dim

    self.conv_1 = conv2d(1, 16, kernel_size=7, stride=2)
    self.conv_2 = conv2d(16, 32, kernel_size=7, stride=2)
    self.conv_3 = conv2d(32, 32, kernel_size=7, stride=2)
    self.conv_4 = conv2d(32, 32, kernel_size=7, stride=2)
    self.flatten = Flatten()
    self.linear = nn.Linear(32 * 10 * 8, self.z_dim)

    if initialize_weights:
        init_weights(self.modules())
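# The constructors in this file rely on small helpers (conv2d, Flatten,
# init_weights) defined elsewhere. A minimal sketch of what they could
# look like, assuming conv2d uses dilation-aware "same" padding, a
# default kernel_size of 3, and a LeakyReLU activation (the actual
# activation and initialization scheme may differ):

import torch.nn as nn


def conv2d(in_channels, out_channels, kernel_size=3, stride=1, dilation=1):
    # "Same"-style padding: preserves spatial size at stride 1 and
    # halves it at stride 2 for odd kernels, even with dilation.
    padding = dilation * (kernel_size - 1) // 2
    return nn.Sequential(
        nn.Conv2d(in_channels, out_channels, kernel_size,
                  stride=stride, padding=padding, dilation=dilation),
        nn.LeakyReLU(0.1, inplace=True),
    )


class Flatten(nn.Module):
    # Flattens every dimension except the batch dimension.
    def forward(self, x):
        return x.view(x.size(0), -1)


def init_weights(modules):
    # Kaiming-initialize conv/linear weights and zero the biases.
    for m in modules:
        if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d, nn.Linear)):
            nn.init.kaiming_normal_(m.weight)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)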
def __init__(self, z_dim, initialize_weights=True):
    """Decodes the optical flow and optical flow mask."""
    super().__init__()

    self.optical_flow_conv = conv2d(2 * z_dim, 64, kernel_size=1, stride=1)

    self.img_deconv6 = deconv(64, 64)
    self.img_deconv5 = deconv(64, 32)
    self.img_deconv4 = deconv(162, 32)
    self.img_deconv3 = deconv(98, 32)
    self.img_deconv2 = deconv(98, 32)

    self.predict_optical_flow6 = predict_flow(64)
    self.predict_optical_flow5 = predict_flow(162)
    self.predict_optical_flow4 = predict_flow(98)
    self.predict_optical_flow3 = predict_flow(98)
    self.predict_optical_flow2 = predict_flow(66)

    self.upsampled_optical_flow6_to_5 = nn.ConvTranspose2d(2, 2, 4, 2, 1, bias=False)
    self.upsampled_optical_flow5_to_4 = nn.ConvTranspose2d(2, 2, 4, 2, 1, bias=False)
    self.upsampled_optical_flow4_to_3 = nn.ConvTranspose2d(2, 2, 4, 2, 1, bias=False)
    self.upsampled_optical_flow3_to_2 = nn.ConvTranspose2d(2, 2, 4, 2, 1, bias=False)

    self.predict_optical_flow2_mask = nn.Conv2d(
        66, 1, kernel_size=3, stride=1, padding=1, bias=False
    )

    if initialize_weights:
        init_weights(self.modules())
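# The decoder assumes FlowNet-style helpers deconv and predict_flow.
# Its input widths suggest skip connections that concatenate an encoder
# feature map, the previous deconv output, and the upsampled coarse
# flow (e.g. 162 = 128 + 32 + 2, 98 = 64 + 32 + 2, 66 = 32 + 32 + 2).
# A minimal sketch of the two helpers under those assumptions:

import torch.nn as nn


def deconv(in_channels, out_channels):
    # 2x spatial upsampling via transposed convolution + LeakyReLU.
    return nn.Sequential(
        nn.ConvTranspose2d(in_channels, out_channels, kernel_size=4,
                           stride=2, padding=1, bias=False),
        nn.LeakyReLU(0.1, inplace=True),
    )


def predict_flow(in_channels):
    # Projects a feature map down to a 2-channel (u, v) flow field.
    return nn.Conv2d(in_channels, 2, kernel_size=3, stride=1,
                     padding=1, bias=False)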
def __init__(self, z_dim, initialize_weights=True):
    """Image encoder taken from Making Sense of Vision and Touch."""
    super().__init__()
    self.z_dim = z_dim

    self.img_conv1 = conv2d(3, 16, kernel_size=7, stride=2)
    self.img_conv2 = conv2d(16, 32, kernel_size=5, stride=2)
    self.img_conv3 = conv2d(32, 64, kernel_size=5, stride=2)
    self.img_conv4 = conv2d(64, 64, stride=2)
    self.img_conv5 = conv2d(64, 128, stride=2)
    self.img_conv6 = conv2d(128, self.z_dim, stride=2)
    self.img_encoder = nn.Linear(4 * self.z_dim, 2 * self.z_dim)
    self.flatten = Flatten()

    if initialize_weights:
        init_weights(self.modules())
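# Shape check: the 4 * z_dim input of img_encoder implies a 2x2 final
# feature map, i.e. a 128x128 RGB input halved by each of the six
# stride-2 convolutions (128 -> 64 -> 32 -> 16 -> 8 -> 4 -> 2). A quick
# sanity check, reusing the hypothetical conv2d helper sketched above:

import torch
import torch.nn as nn

z_dim = 128  # assumed latent size for the check
convs = nn.Sequential(
    conv2d(3, 16, kernel_size=7, stride=2),
    conv2d(16, 32, kernel_size=5, stride=2),
    conv2d(32, 64, kernel_size=5, stride=2),
    conv2d(64, 64, stride=2),
    conv2d(64, 128, stride=2),
    conv2d(128, z_dim, stride=2),
)
with torch.no_grad():
    out = convs(torch.randn(1, 3, 128, 128))
assert out.shape == (1, z_dim, 2, 2)  # flatten -> 4 * z_dim features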
def __init__(self, z_dim, initialize_weights=True):
    """Simplified depth encoder taken from Making Sense of Vision and Touch."""
    super().__init__()
    self.z_dim = z_dim

    self.depth_conv1 = conv2d(1, 32, kernel_size=3, stride=2)
    self.depth_conv2 = conv2d(32, 64, kernel_size=3, stride=2)
    self.depth_conv3 = conv2d(64, 64, kernel_size=4, stride=2)
    self.depth_conv4 = conv2d(64, 64, stride=2)
    self.depth_conv5 = conv2d(64, 128, stride=2)
    self.depth_conv6 = conv2d(128, self.z_dim, stride=2)
    self.depth_encoder = nn.Linear(16 * self.z_dim, 2 * self.z_dim)
    self.flatten = Flatten()

    if initialize_weights:
        init_weights(self.modules())
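# By the same arithmetic, depth_encoder's 16 * z_dim input implies a
# 4x4 final feature map, which under the same-padding convention
# corresponds to a 256x256 depth input (an assumption; the resolution
# is not stated in the code). Verifying the per-layer halving:

h = 256
for k in (3, 3, 4, 3, 3, 3):      # kernels of depth_conv1..6 (3 assumed as default)
    p = (k - 1) // 2              # "same"-style padding
    h = (h + 2 * p - k) // 2 + 1  # stride-2 output size
assert h == 4                     # flatten -> 4 * 4 * z_dim == 16 * z_dim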
def test_same_shape_with_dilation(self):
    x = torch.randn(1, 1, 5, 5)
    conv = conv2d(1, 1, 3, dilation=2)
    with torch.no_grad():
        out = conv(x)
    assert out.shape[2:] == x.shape[2:]
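# For this test to pass, conv2d must account for dilation when padding:
# the effective kernel size is dilation * (kernel_size - 1) + 1, so an
# odd kernel needs padding = dilation * (kernel_size - 1) // 2 to keep
# stride-1 outputs the same size as their inputs. For kernel_size=3,
# dilation=2 that gives padding=2, matching the 5x5 -> 5x5 expectation:

k, d, h = 3, 2, 5
p = d * (k - 1) // 2                         # dilation-aware "same" padding
h_out = (h + 2 * p - (d * (k - 1) + 1)) + 1  # stride-1 output size
assert h_out == h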