def forward(self, x):
    """Run the conv + LSTM stack over sliding windows of the input.

    Args:
        x: input batch; a channel dimension is inserted at position 1
           before windowing. (Presumably (B, H, W) images — TODO confirm
           against the caller.)

    Returns:
        A ``(log_probs, input_lengths)`` pair for CTC decoding:
        ``log_probs`` has shape (T, B, classes), ``input_lengths`` is a
        LongTensor holding ``T`` repeated ``B`` times.
    """
    x = x.unsqueeze(1)
    windows = slide_window(x, window_width, window_stride)
    n_batch, n_chan, height, win_w, n_steps = windows.shape
    # Make the tensor time-major, then fold time into the batch axis so
    # conv1 processes every window independently (PyTorch's equivalent
    # of a TimeDistributed layer).
    windows = windows.permute(4, 0, 1, 2, 3)
    flat = windows.reshape(n_steps * n_batch, n_chan, height, win_w)
    feat = self.conv1(flat)  # (T*B, C, H/2-2, W/2-2)
    feat = self.conv2(feat.view(n_steps * n_batch, -1))
    seq = feat.view(n_steps, n_batch, 128)
    seq, _ = self.lstm(seq)  # output: (T, B, 128); hidden state unused
    logits = self.linear(seq)  # nn.Linear broadcasts over leading dims
    # (T, B, classes) layout is what the ctc_decode step expects.
    log_probs = nn.functional.log_softmax(logits, dim=2)
    input_lengths = torch.Tensor([n_steps] * n_batch).long()
    return log_probs, input_lengths
def slide_window_bound(image, window_width=window_width, window_stride=window_stride):
    """Apply ``slide_window`` to *image* using module-level default sizes.

    The defaults are bound to the module-level ``window_width`` /
    ``window_stride`` values at definition time (deliberate: later
    rebinding of those globals does not affect this function's defaults).
    """
    return slide_window(image, window_width, window_stride)