Example #1
    def __init__(self, vp_value_count, output_shape, name='Full Network'):
        """
        Initializes the Full Network.
        :param vp_value_count: (int) The number of values that identify the viewpoint.
        :param output_shape: (5-tuple) The desired output shape for generated videos. Must match video input shape.
                              Legal values: (bsz, 3, 8, 112, 112) and (bsz, 3, 16, 112, 112)
        :param name: (str, optional) The name of the network (default 'Full Network').
        Raises:
            ValueError: if 'vp_value_count' is not a legal value count
            ValueError: if 'output_shape' does not contain a legal number of frames.
        """
        if vp_value_count not in self.VALID_VP_VALUE_COUNTS:
            raise ValueError('Invalid number of vp values: %d' % vp_value_count)
        if output_shape[2] not in self.VALID_FRAME_COUNTS:
            raise ValueError('Invalid number of frames in desired output: %d' % output_shape[2])

        super(FullNetwork, self).__init__()

        self.net_name = name
        self.vp_value_count = vp_value_count
        self.output_shape = output_shape
        self.out_frames = output_shape[2]
        self.rep_channels = 256
        self.rep_frames = 4
        self.rep_size = 14

        self.vgg = vgg16(pretrained=True, weights_path=vgg_weights_path)
        self.i3d = InceptionI3d(final_endpoint='Mixed_5c', in_frames=self.out_frames,
                                pretrained=True, weights_path=i3d_weights_path)

        self.exp = Expander(vp_value_count=self.vp_value_count, out_frames=self.rep_frames, out_size=self.rep_size)
        self.trans = Transformer(in_channels=self.rep_channels + self.vp_value_count, out_channels=self.rep_channels)

        self.gen = Generator(in_channels=[self.rep_channels, self.rep_channels], out_frames=self.out_frames)
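
The constructor's guard clauses check against class-level whitelists before any sub-network is built. Below is a minimal, runnable sketch of just that validation logic; the VALID_VP_VALUE_COUNTS values are hypothetical, while the frame counts follow the docstring.

import torch.nn as nn

class FullNetwork(nn.Module):
    VALID_VP_VALUE_COUNTS = (1, 3)  # hypothetical; the real values are not shown above
    VALID_FRAME_COUNTS = (8, 16)    # matches the docstring's legal frame counts

    def __init__(self, vp_value_count, output_shape, name='Full Network'):
        # validate before building anything, exactly as in the example
        if vp_value_count not in self.VALID_VP_VALUE_COUNTS:
            raise ValueError('Invalid number of vp values: %d' % vp_value_count)
        if output_shape[2] not in self.VALID_FRAME_COUNTS:
            raise ValueError('Invalid number of frames in desired output: %d' % output_shape[2])
        super(FullNetwork, self).__init__()
        self.net_name = name

net = FullNetwork(vp_value_count=1, output_shape=(4, 3, 16, 112, 112))   # passes validation
# FullNetwork(vp_value_count=1, output_shape=(4, 3, 7, 112, 112))        # would raise ValueError
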
Example #2
    def __init__(self,
                 vp_value_count,
                 stdev,
                 output_shape,
                 pretrained=False,
                 vgg_weights_path='',
                 i3d_weights_path='',
                 name='Full Network'):
        """
        Initializes the Full Network.
        :param vp_value_count: (int) The number of values that identify the viewpoint.
        :param stdev: (float) The standard deviation passed to the keypoint predictor.
        :param output_shape: (5-tuple) The desired output shape for generated videos. Must match video input shape.
                              Legal values: (bsz, 3, 8, 112, 112) and (bsz, 3, 16, 112, 112)
        :param pretrained: (bool, optional) Whether to load pretrained weights for VGG-16 and I3D (default False).
        :param vgg_weights_path: (str, optional) Path to the pretrained VGG-16 weights (default '').
        :param i3d_weights_path: (str, optional) Path to the pretrained I3D weights (default '').
        :param name: (str, optional) The name of the network (default 'Full Network').
        Raises:
            ValueError: if 'vp_value_count' is not a legal value count
            ValueError: if 'output_shape' does not contain a legal number of frames.
        """
        if vp_value_count not in self.VALID_VP_VALUE_COUNTS:
            raise ValueError('Invalid number of vp values: %d' %
                             vp_value_count)
        if output_shape[2] not in self.VALID_FRAME_COUNTS:
            raise ValueError('Invalid number of frames in desired output: %d' %
                             output_shape[2])

        super(FullNetwork, self).__init__()

        self.net_name = name
        self.vp_value_count = vp_value_count
        self.stdev = stdev
        self.output_shape = output_shape
        self.out_frames = output_shape[2]

        # specs of various features
        self.app_feat = 128
        self.rep_feat = 128
        self.rep_frames = 4
        self.rep_size = 14
        self.nkp = 32

        self.vgg = vgg16(pretrained=pretrained, weights_path=vgg_weights_path)
        self.i3d = InceptionI3d(final_endpoint='Mixed_5c',
                                in_frames=self.out_frames,
                                pretrained=pretrained,
                                weights_path=i3d_weights_path)

        self.exp = Expander(vp_value_count=self.vp_value_count)

        # convs to make all appearance encodings have the same number of channels, so they can be used in the same convGRU
        self.app_conv128 = nn.Conv2d(in_channels=128,
                                     out_channels=self.app_feat,
                                     kernel_size=(3, 3),
                                     stride=(1, 1),
                                     padding=(1, 1))
        self.app_conv256a = nn.Conv2d(in_channels=256,
                                      out_channels=self.app_feat,
                                      kernel_size=(3, 3),
                                      stride=(1, 1),
                                      padding=(1, 1))
        self.app_conv256b = nn.Conv2d(in_channels=256,
                                      out_channels=self.app_feat,
                                      kernel_size=(3, 3),
                                      stride=(1, 1),
                                      padding=(1, 1))
        self.app_convs = nn.ModuleList([
            nn.Sequential(self.app_conv128, nn.ReLU(inplace=True)),
            nn.Sequential(self.app_conv256a, nn.ReLU(inplace=True)),
            nn.Sequential(self.app_conv256b, nn.ReLU(inplace=True))
        ])

        # convs to make all motion features have the same number of channels, so they can be used in the same trans net
        self.rep_conv64 = nn.Conv3d(in_channels=64,
                                    out_channels=self.rep_feat,
                                    kernel_size=(3, 3, 3),
                                    stride=(1, 1, 1),
                                    padding=(1, 1, 1))
        self.rep_conv192 = nn.Conv3d(in_channels=192,
                                     out_channels=self.rep_feat,
                                     kernel_size=(3, 3, 3),
                                     stride=(1, 1, 1),
                                     padding=(1, 1, 1))
        self.rep_conv256 = nn.Conv3d(in_channels=256,
                                     out_channels=self.rep_feat,
                                     kernel_size=(3, 3, 3),
                                     stride=(1, 1, 1),
                                     padding=(1, 1, 1))
        self.rep_convs = nn.ModuleList([
            nn.Sequential(self.rep_conv64, nn.ReLU(inplace=True)),
            nn.Sequential(self.rep_conv192, nn.ReLU(inplace=True)),
            nn.Sequential(self.rep_conv256, nn.ReLU(inplace=True))
        ])

        self.trans = Transformer(in_channels=self.rep_feat +
                                 self.vp_value_count,
                                 out_channels=self.rep_feat)

        self.kpp = KPPredictor(in_channels=self.rep_feat,
                               nkp=self.nkp,
                               stdev=self.stdev)

        self.vpp = VPPredictor(in_channels=256)

        self.gru = ConvGRU(input_dim=self.rep_feat,
                           hidden_dim=[self.app_feat],
                           kernel_size=(7, 7),
                           num_layers=1,
                           batch_first=True,
                           bias=False,
                           return_all_layers=False)

        self.gen = Generator(in_channels=[self.app_feat, self.nkp],
                             out_frames=self.out_frames)
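
The three channel-matching convs above exist so that feature maps from different VGG stages can feed a single ConvGRU. The following is a hedged sketch of that routing; the feature shapes are assumptions, since the real maps come from vgg16 and are not shown.

import torch
import torch.nn as nn

app_feat = 128
# one conv per appearance stage, as in the example above
app_convs = nn.ModuleList([
    nn.Sequential(nn.Conv2d(c, app_feat, kernel_size=(3, 3), padding=(1, 1)),
                  nn.ReLU(inplace=True))
    for c in (128, 256, 256)
])

# hypothetical multi-scale appearance features (stand-ins for vgg16 outputs)
feats = [torch.randn(2, 128, 56, 56),
         torch.randn(2, 256, 28, 28),
         torch.randn(2, 256, 14, 14)]
unified = [conv(f) for conv, f in zip(app_convs, feats)]
print([u.shape[1] for u in unified])  # -> [128, 128, 128], all ready for one convGRU
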
Example #3
    def __init__(self, vp_value_count, output_shape, name='Full Network'):
        """
        Initializes the Full Network.
        :param vp_value_count: (int) The number of values that identify the viewpoint.
        :param output_shape: (5-tuple) The desired output shape for generated videos. Must match video input shape.
                              Legal values: (bsz, 3, 8, 112, 112) and (bsz, 3, 16, 112, 112)
        :param name: (str, optional) The name of the network (default 'Full Network').
        Raises:
            ValueError: if 'vp_value_count' is not a legal value count
            ValueError: if 'output_shape' does not contain a legal number of frames.
        """
        if vp_value_count not in self.VALID_VP_VALUE_COUNTS:
            raise ValueError('Invalid number of vp values: %d' %
                             vp_value_count)
        if output_shape[2] not in self.VALID_FRAME_COUNTS:
            raise ValueError('Invalid number of frames in desired output: %d' %
                             output_shape[2])

        super(FullNetwork, self).__init__()

        # params
        self.net_name = name
        self.vp_value_count = vp_value_count
        self.output_shape = output_shape
        self.out_frames = output_shape[2]
        self.rep_feat = 128
        self.app_feat = 256

        # networks
        self.vgg = vgg16(pretrained=True, weights_path=vgg_weights_path)
        self.i3d = InceptionI3d(final_endpoint='Mixed_5c',
                                in_frames=self.out_frames,
                                pretrained=True,
                                weights_path=i3d_weights_path)
        self.exp = Expander(vp_value_count=self.vp_value_count)
        self.trans = Transformer(in_channels=self.rep_feat +
                                 self.vp_value_count,
                                 out_channels=self.rep_feat)
        self.gen = Generator(in_channels=[self.app_feat, self.rep_feat],
                             out_frames=self.out_frames)

        # ConvLSTMs keyed by input spatial size; an nn.ModuleDict (string keys)
        # registers their parameters with the network, unlike a plain dict, so
        # look them up with str(size)
        self.conv_lstms = nn.ModuleDict({
            '56':
            ConvLSTM(input_dim=self.rep_feat,
                     hidden_dim=[self.app_feat],
                     kernel_size=(3, 3),
                     num_layers=1,
                     in_shape=(56, 56),
                     batch_first=True,
                     bias=False,
                     return_all_layers=False),
            '28':
            ConvLSTM(input_dim=self.rep_feat,
                     hidden_dim=[self.app_feat],
                     kernel_size=(3, 3),
                     num_layers=1,
                     in_shape=(28, 28),
                     batch_first=True,
                     bias=False,
                     return_all_layers=False),
            '14':
            ConvLSTM(input_dim=self.rep_feat,
                     hidden_dim=[self.app_feat],
                     kernel_size=(3, 3),
                     num_layers=1,
                     in_shape=(14, 14),
                     batch_first=True,
                     bias=False,
                     return_all_layers=False)
        })

        # convs
        self.app_conv128 = nn.Conv2d(in_channels=128,
                                     out_channels=self.app_feat,
                                     kernel_size=(3, 3),
                                     stride=(1, 1),
                                     padding=(1, 1))
        self.app_conv256 = nn.Conv2d(in_channels=256,
                                     out_channels=self.app_feat,
                                     kernel_size=(3, 3),
                                     stride=(1, 1),
                                     padding=(1, 1))
        self.app_conv512 = nn.Conv2d(in_channels=512,
                                     out_channels=self.app_feat,
                                     kernel_size=(3, 3),
                                     stride=(1, 1),
                                     padding=(1, 1))
        self.app_convs = {
            128: self.app_conv128,
            256: self.app_conv256,
            512: self.app_conv512
        }

        self.hconv = nn.Conv2d(in_channels=self.app_feat,
                               out_channels=128,
                               kernel_size=(3, 3),
                               stride=(1, 1),
                               padding=(1, 1))
        self.cconv = nn.Conv2d(in_channels=self.app_feat,
                               out_channels=128,
                               kernel_size=(3, 3),
                               stride=(1, 1),
                               padding=(1, 1))

        self.rep_conv64 = nn.Conv3d(in_channels=64,
                                    out_channels=self.rep_feat,
                                    kernel_size=(3, 3, 3),
                                    stride=(1, 1, 1),
                                    padding=(1, 1, 1))
        self.rep_conv192 = nn.Conv3d(in_channels=192,
                                     out_channels=self.rep_feat,
                                     kernel_size=(3, 3, 3),
                                     stride=(1, 1, 1),
                                     padding=(1, 1, 1))
        self.rep_conv256 = nn.Conv3d(in_channels=256,
                                     out_channels=self.rep_feat,
                                     kernel_size=(3, 3, 3),
                                     stride=(1, 1, 1),
                                     padding=(1, 1, 1))
        self.rep_convs = {
            64: self.rep_conv64,
            192: self.rep_conv192,
            256: self.rep_conv256
        }
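
Keying the conv dicts on channel count lets one loop handle every VGG/I3D stage. Below is a minimal sketch of that dispatch with a hypothetical I3D feature map; the example's actual forward pass is not shown, so this only illustrates the lookup.

import torch
import torch.nn as nn

rep_feat = 128
# as above: one 3D conv per possible input channel count
rep_convs = {c: nn.Conv3d(c, rep_feat, kernel_size=(3, 3, 3), padding=(1, 1, 1))
             for c in (64, 192, 256)}

feat = torch.randn(2, 192, 8, 28, 28)     # hypothetical I3D feature map
rep = rep_convs[feat.shape[1]](feat)      # the channel count selects the matching conv
print(rep.shape)                          # torch.Size([2, 128, 8, 28, 28])
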
Example #4
    def __init__(self, vp_value_count, output_shape, name='Full Network'):
        """
        Initializes the Full Network.
        :param vp_value_count: (int) The number of values that identify the viewpoint.
        :param output_shape: (5-tuple) The desired output shape for generated videos. Must match video input shape.
                              Legal values: (bsz, 3, 8, 112, 112) and (bsz, 3, 16, 112, 112)
        :param name: (str, optional) The name of the network (default 'Full Network').
        Raises:
            ValueError: if 'vp_value_count' is not a legal value count
            ValueError: if 'output_shape' does not contain a legal number of frames.
        """
        if vp_value_count not in self.VALID_VP_VALUE_COUNTS:
            raise ValueError('Invalid number of vp values: %d' %
                             vp_value_count)
        if output_shape[2] not in self.VALID_FRAME_COUNTS:
            raise ValueError('Invalid number of frames in desired output: %d' %
                             output_shape[2])

        super(FullNetwork, self).__init__()

        self.net_name = name
        self.vp_value_count = vp_value_count
        self.output_shape = output_shape
        self.out_frames = output_shape[2]
        self.rep_channels = 256
        self.rep_frames = 4
        self.rep_size = 14

        self.vgg = vgg16(pretrained=True, weights_path=vgg_weights_path)
        self.i3d = InceptionI3d(final_endpoint='Mixed_5c',
                                in_frames=self.out_frames,
                                pretrained=True,
                                weights_path=i3d_weights_path)

        self.exp = Expander(vp_value_count=self.vp_value_count)

        # convs to make all appearance encodings have the same number of channels, so they can be used in the same convLSTM
        self.app_conv128 = nn.Conv2d(in_channels=128,
                                     out_channels=256,
                                     kernel_size=(3, 3),
                                     stride=(1, 1),
                                     padding=(1, 1))
        self.app_conv256a = nn.Conv2d(in_channels=256,
                                      out_channels=256,
                                      kernel_size=(3, 3),
                                      stride=(1, 1),
                                      padding=(1, 1))
        self.app_conv256b = nn.Conv2d(in_channels=512,
                                      out_channels=256,
                                      kernel_size=(3, 3),
                                      stride=(1, 1),
                                      padding=(1, 1))
        self.app_convs = nn.ModuleList([
            self.app_conv128, self.app_conv256a, self.app_conv256b
        ])

        # convs for the initial hidden and current states of the convLSTM
        self.hconv = nn.Conv2d(in_channels=256,
                               out_channels=128,
                               kernel_size=(3, 3),
                               stride=(1, 1),
                               padding=(1, 1))
        self.cconv = nn.Conv2d(in_channels=256,
                               out_channels=128,
                               kernel_size=(3, 3),
                               stride=(1, 1),
                               padding=(1, 1))

        # convs to make all motion features have the same number of channels, so they can be used in the same Trans Net
        self.rep_conv64 = nn.Conv3d(in_channels=64,
                                    out_channels=256,
                                    kernel_size=(3, 3, 3),
                                    stride=(1, 1, 1),
                                    padding=(1, 1, 1))
        self.rep_conv192 = nn.Conv3d(in_channels=192,
                                     out_channels=256,
                                     kernel_size=(3, 3, 3),
                                     stride=(1, 1, 1),
                                     padding=(1, 1, 1))
        self.rep_conv256 = nn.Conv3d(in_channels=256,
                                     out_channels=256,
                                     kernel_size=(3, 3, 3),
                                     stride=(1, 1, 1),
                                     padding=(1, 1, 1))
        self.rep_convs = {
            64: self.rep_conv64,
            192: self.rep_conv192,
            256: self.rep_conv256
        }

        self.trans = Transformer(in_channels=256 + self.vp_value_count,
                                 out_channels=128)

        self.conv_lstm = ConvLSTM(input_dim=128,
                                  hidden_dim=[128],
                                  kernel_size=(3, 3),
                                  num_layers=1,
                                  batch_first=True,
                                  bias=False,
                                  return_all_layers=False)

        self.gen = Generator(in_channels=[128], out_frames=self.out_frames)
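
The hconv/cconv pair projects the 256-channel appearance encoding down to the 128 channels the convLSTM operates on; per the comment above, these plausibly seed its initial hidden and cell states. A sketch under that assumption, with a hypothetical input:

import torch
import torch.nn as nn

hconv = nn.Conv2d(256, 128, kernel_size=(3, 3), padding=(1, 1))
cconv = nn.Conv2d(256, 128, kernel_size=(3, 3), padding=(1, 1))

app_enc = torch.randn(2, 256, 14, 14)     # hypothetical appearance encoding
h0, c0 = hconv(app_enc), cconv(app_enc)   # candidate initial hidden/cell states
print(h0.shape, c0.shape)                 # both torch.Size([2, 128, 14, 14])
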
Example #5
        # generator
        model = FullNetwork(vp_value_count=VP_VALUE_COUNT,
                            stdev=STDEV,
                            output_shape=(BATCH_SIZE, CHANNELS, FRAMES, HEIGHT,
                                          WIDTH))
        if pretrained:
            model.load_state_dict(torch.load(pretrained_weights))
        model = model.to(device)

        if device == 'cuda':
            # rebind model so the wrapped network is actually trained
            model = torch.nn.DataParallel(model)
            cudnn.benchmark = True

        criterion = nn.MSELoss()
        perceptual_loss = vgg16()
        optimizer = optim.Adam(model.parameters(), lr=LR)

        # data
        trainset = NTUDataset(root_dir=data_root_dir,
                              data_file=train_split,
                              param_file=param_file,
                              resize_height=HEIGHT,
                              resize_width=WIDTH,
                              clip_len=FRAMES,
                              skip_len=SKIP_LEN,
                              random_all=RANDOM_ALL,
                              precrop=PRECROP)
        trainloader = torch.utils.data.DataLoader(trainset,
                                                  batch_size=BATCH_SIZE,
                                                  shuffle=True)
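
The example stops after the data pipeline, so the training loop itself is not shown. Below is a minimal sketch of one optimization step with the same criterion/optimizer wiring; FullNetwork's forward signature is not given, so a stand-in module replaces it and only the step structure is illustrative.

import torch
import torch.nn as nn
import torch.optim as optim

device = 'cpu'
# stand-in for FullNetwork: any module mapping a clip to a clip of the same shape
model = nn.Conv3d(3, 3, kernel_size=(3, 3, 3), padding=(1, 1, 1)).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

clip = torch.randn(2, 3, 16, 112, 112, device=device)  # (bsz, channels, frames, H, W)
optimizer.zero_grad()
recon = model(clip)
loss = criterion(recon, clip)   # reconstruction loss against the input clip
loss.backward()
optimizer.step()
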