Ejemplo n.º 1
0
 def __init__(self,
              rgb_size=RGB_FEATURES_SIZE,
              audio_size=AUDIO_FEATURES_SIZE,
              output_size=YOUTUBE8M_LABELS_N,
              inner_sizes=(2048, 2048)):
     """Build the model as a single FCN over the summed feature widths.

     Args:
         rgb_size: Width of the RGB feature input.
         audio_size: Width of the audio feature input.
         output_size: Output width handed to FCN.
         inner_sizes: Forwarded to FCN as its third positional argument
             (presumably the hidden layer widths -- confirm against FCN).
     """
     super().__init__()
     # The FCN input width is the sum of both feature widths; presumably
     # the forward pass concatenates RGB and audio -- TODO confirm.
     input_size = rgb_size + audio_size
     # Note: the nn.Sigmoid class itself (not an instance) is passed on.
     self._impl = FCN(input_size, output_size, inner_sizes,
                      dropout=0.2, out_activation=nn.Sigmoid)
Ejemplo n.º 2
0
 def __init__(self,
              rgb_size=RGB_FEATURES_SIZE,
              audio_size=AUDIO_FEATURES_SIZE,
              output_size=YOUTUBE8M_LABELS_N,
              inner_size=2048,
              layers_number=2):
     """Build a chain of single-layer GRUs followed by an FCN head.

     Args:
         rgb_size: Width of the RGB feature input.
         audio_size: Width of the audio feature input.
         output_size: Output width handed to the FCN head.
         inner_size: Hidden size of every GRU.
         layers_number: Number of separate GRU modules to create.
     """
     super().__init__()
     combined_size = rgb_size + audio_size

     recurrent = []
     for depth in range(layers_number):
         # Only the first GRU consumes the raw feature width; every later
         # one takes an inner_size-wide input.
         in_width = inner_size if depth else combined_size
         recurrent.append(
             nn.GRU(input_size=in_width,
                    hidden_size=inner_size,
                    num_layers=1,
                    batch_first=True,
                    bidirectional=False))
     self._layers = nn.ModuleList(recurrent)

     # The head's input is inner_size per GRU; presumably forward
     # concatenates one state per layer -- TODO confirm.
     head_input = inner_size * layers_number
     self._out = FCN(head_input, output_size, (1524, 1524))

     self._num_layers = layers_number
     self._inner_size = inner_size
Ejemplo n.º 3
0
    def __init__(self,
                 rgb_size=RGB_FEATURES_SIZE,
                 audio_size=AUDIO_FEATURES_SIZE,
                 output_size=YOUTUBE8M_LABELS_N,
                 layers_number=3,
                 hidden_size=1024):
        """Create linear/attention layer pairs, a batch norm and an FCN head.

        Args:
            rgb_size: Width of the RGB feature input.
            audio_size: Width of the audio feature input.
            output_size: Output width handed to the FCN head.
            layers_number: Number of linear layers and of attention layers.
            hidden_size: Output width of each linear layer and input width
                of each attention layer.
        """
        super().__init__()

        base_size = rgb_size + audio_size

        # Layer k takes the base features plus hidden_size for each of the
        # k layers before it (presumably their outputs are concatenated in
        # forward -- TODO confirm).
        projections = []
        for depth in range(layers_number):
            projections.append(
                nn.Linear(base_size + hidden_size * depth, hidden_size))
        self._linear_layers = nn.ModuleList(projections)

        # One scalar-output linear layer per projection layer.
        scorers = []
        for _ in range(layers_number):
            scorers.append(nn.Linear(hidden_size, 1))
        self._attention_layers = nn.ModuleList(scorers)

        # Batch norm and the head share the same width: the base features
        # plus hidden_size for every layer.
        full_size = base_size + hidden_size * layers_number
        self._bn = nn.BatchNorm1d(full_size)
        self._out = FCN(full_size, output_size, (4096, 4096))
        self._num_layers = layers_number
Ejemplo n.º 4
0
    def __init__(self,
                 rgb_size=RGB_FEATURES_SIZE,
                 audio_size=AUDIO_FEATURES_SIZE,
                 output_size=YOUTUBE8M_LABELS_N,
                 layers_number=4,
                 hidden_size=768):
        """Create linear/attention/dropout layer triples and an FCN head.

        Args:
            rgb_size: Width of the RGB feature input.
            audio_size: Width of the audio feature input.
            output_size: Output width handed to the FCN head.
            layers_number: Number of linear, attention and dropout layers.
            hidden_size: Output width of each linear layer and input width
                of each attention layer.
        """
        super().__init__()

        base_size = rgb_size + audio_size

        # Layer k takes the base features plus hidden_size for each of the
        # k layers before it (presumably their outputs are concatenated in
        # forward -- TODO confirm).
        projections = []
        for depth in range(layers_number):
            projections.append(
                nn.Linear(base_size + hidden_size * depth, hidden_size))
        self._linear_layers = nn.ModuleList(projections)

        # One scalar-output linear layer per projection layer.
        scorers = []
        for _ in range(layers_number):
            scorers.append(nn.Linear(hidden_size, 1))
        self._attention_layers = nn.ModuleList(scorers)

        # Dropout probability rises by 0.1 per layer (0.1, 0.2, ...).
        # nn.Dropout rejects p > 1, so layers_number above 10 will raise.
        droppers = []
        for depth in range(layers_number):
            droppers.append(nn.Dropout((depth + 1) / 10.))
        self._dropout_layers = nn.ModuleList(droppers)

        head_input = base_size + hidden_size * layers_number
        self._out = FCN(head_input, output_size, (4096, 4096), dropout=0.3)
        self._num_layers = layers_number
Ejemplo n.º 5
0
    def __init__(self,
                 rgb_size=RGB_FEATURES_SIZE,
                 audio_size=AUDIO_FEATURES_SIZE,
                 output_size=YOUTUBE8M_LABELS_N,
                 rgb_inner_size=1024,
                 audio_inner_size=128,
                 layers_number=1):
        """Create separate bidirectional LSTM stacks for RGB and audio.

        Besides the two LSTM stacks this sets up one same-width linear
        layer and one scalar-output attention layer per stream, a batch
        norm over the combined width, and an FCN head.

        Args:
            rgb_size: Width of the RGB feature input.
            audio_size: Width of the audio feature input.
            output_size: Output width handed to the FCN head.
            rgb_inner_size: Hidden size of each RGB LSTM.
            audio_inner_size: Hidden size of each audio LSTM.
            layers_number: Number of LSTM modules in each stream.
        """
        super().__init__()

        def make_stack(feature_size, inner_size):
            # LSTM k takes the stream's features plus 2 * inner_size for
            # each of the k LSTMs before it; a bidirectional LSTM emits
            # 2 * hidden_size features (presumably earlier outputs are
            # concatenated onto the input in forward -- TODO confirm).
            stack = []
            for depth in range(layers_number):
                stack.append(
                    nn.LSTM(input_size=feature_size + 2 * inner_size * depth,
                            hidden_size=inner_size,
                            num_layers=1,
                            batch_first=True,
                            bidirectional=True))
            return nn.ModuleList(stack)

        # RGB stack is built before the audio stack, as in the original.
        self._rgb_layers = make_stack(rgb_size, rgb_inner_size)
        self._audio_layers = make_stack(audio_size, audio_inner_size)

        self._first_linear_rgb = nn.Linear(rgb_size, rgb_size)
        self._first_linear_audio = nn.Linear(audio_size, audio_size)

        # Per-stream width: raw features plus both directions of every LSTM.
        rgb_total = rgb_size + rgb_inner_size * 2 * layers_number
        audio_total = audio_size + audio_inner_size * 2 * layers_number

        self._rgb_attention = nn.Linear(rgb_total, 1)
        self._audio_attention = nn.Linear(audio_total, 1)

        # Batch norm and the head both operate on the two streams combined.
        joint_size = rgb_total + audio_total
        self._bn = nn.BatchNorm1d(joint_size)
        self._out = FCN(joint_size, output_size, (4096, 4096))
        self._num_layers = layers_number