def create_transformed_images_row(row_number: int,
                                  number_of_image_tensors: int,
                                  number_of_channels: int, width: int,
                                  transformed_images_width, image_tensors_row,
                                  device):
    """Shift one image row to the right by `row_number` and zero-pad it.

    The row is preceded by `row_number` zero columns and followed by
    `transformed_images_width - width - row_number` zero columns, so the
    returned row has width `transformed_images_width` (when both counts are
    non-negative).

    Args:
        row_number: index of the row; also the number of leading zeros.
        number_of_image_tensors: size of dimension 0 of `image_tensors_row`.
        number_of_channels: size of dimension 1 of `image_tensors_row`.
        width: size of dimension 2 of `image_tensors_row`.
        transformed_images_width: width of the padded output row.
        image_tensors_row: three-dimensional tensor holding the same row of
            every image; padding is concatenated along dimension 2.
        device: kept for backward compatibility. The padding is now allocated
            on the device of `image_tensors_row`, which is the only device on
            which the concatenation can succeed.

    Returns:
        The padded row, or `image_tensors_row` itself when no padding is
        needed.
    """
    leading_zeros = row_number
    tailing_zeros = transformed_images_width - width - row_number

    def zeros_of_width(padding_width):
        # Allocate directly on the row's device: no host-to-device copy and
        # no dependence on a global "use cuda" switch.
        return torch.zeros(number_of_image_tensors, number_of_channels,
                           padding_width, device=image_tensors_row.device)

    parts = []
    if leading_zeros > 0:
        parts.append(zeros_of_width(leading_zeros))
    parts.append(image_tensors_row)
    if tailing_zeros > 0:
        parts.append(zeros_of_width(tailing_zeros))

    if len(parts) == 1:
        return image_tensors_row
    # torch.cat keeps the autograd link to image_tensors_row.
    return torch.cat(parts, 2)
def create_row_diagonal_offset_tensors_parallel_using_split(image_tensors):
    """Build the row-shifted ("skewed") version of a batch of images.

    The input is split into its rows along dimension 2. Row `y` is padded by
    ImageInputTransformer.create_transformed_images_row with `y` leading
    zeros and enough tailing zeros to reach the skewed width, and the padded
    rows are concatenated back along dimension 2. Only torch.split and
    torch.cat are used (no slice assignment into a zeros tensor), so the
    result stays connected to `image_tensors` in the autograd graph.

    Args:
        image_tensors: four-dimensional tensor; dimensions 0, 1 and 3 are read
            as number of images, channels and width, dimension 2 holds the
            rows.

    Returns:
        The tensor of padded rows, concatenated along dimension 2.
    """
    # `device` has to be bound on the CPU path as well: it is always passed
    # on below, and it was previously undefined when CUDA was not in use.
    device = image_tensors.get_device() if Utils.use_cuda() else None

    number_of_image_tensors = image_tensors.size(0)
    number_of_channels = image_tensors.size(1)
    width = image_tensors.size(3)
    transformed_images_width = ImageInputTransformer.\
        get_skewed_images_width_four_dimensional_tensor(image_tensors)

    skewed_rows = []
    for row_number, image_tensors_row in enumerate(
            torch.split(image_tensors, 1, 2)):
        # split keeps a row dimension of size one; drop it for the padding
        # step and restore it afterwards so the rows can be concatenated.
        new_row = ImageInputTransformer.create_transformed_images_row(
            row_number, number_of_image_tensors, number_of_channels, width,
            transformed_images_width, image_tensors_row.squeeze(2), device)
        skewed_rows.append(new_row.unsqueeze(2))

    return torch.cat(skewed_rows, 2)
def dechunk_block_tensor_concatenated_along_batch_dimension_breaks_gradient(
        self, tensor: torch.tensor):
    """Reassemble full-size tensors from blocks stacked along dimension 0.

    `tensor` is viewed as (blocks per example, examples, channels,
    block height, block width). Each block is then copied into the region of
    the result given by self.height_span / self.width_span.

    NOTE: the blocks are written by slice assignment, which gives the result
    a CopySlices grad_fn. As the method name and the original FIXME say, this
    may not propagate gradients as intended; rebuilding the result with
    torch.cat was suggested there as the alternative.

    Args:
        tensor: block tensor whose first dimension is
            number_of_feature_blocks_per_example * number_of_examples.

    Returns:
        Tensor of size (number_of_examples, channels, original height,
        original width), allocated on the device of `tensor`.
    """
    number_of_examples = \
        tensor.size(0) // self.number_of_feature_blocks_per_example
    channels = tensor.size(1)

    tensor_grouped_by_block = tensor.view(
        self.number_of_feature_blocks_per_example, number_of_examples,
        channels, self.block_size.height, self.block_size.width)

    # Allocate on the input's device straight away instead of creating the
    # zeros on the CPU and copying them over.
    result = torch.zeros(number_of_examples, channels,
                         self.original_size.height, self.original_size.width,
                         device=tensor.device)

    for block_index in range(tensor_grouped_by_block.size(0)):
        height_span_begin, height_span_end = self.height_span(block_index)
        width_span_begin, width_span_end = self.width_span(block_index)
        result[:, :, height_span_begin:height_span_end,
               width_span_begin:width_span_end] = \
            tensor_grouped_by_block[block_index]

    return result
def get_shifted_column(previous_state_column, hidden_states_size: int):
    """Shift a state column by one position along dimension 2.

    A single zero entry is put in front and the last entry along dimension 2
    is dropped, so the size of dimension 2 is unchanged.

    Args:
        previous_state_column: three-dimensional tensor. Its second dimension
            must equal `hidden_states_size` for the concatenation to succeed.
        hidden_states_size: size of dimension 1 of the zero padding.

    Returns:
        A new tensor holding the shifted column.
    """
    height = previous_state_column.size(2)
    # Create the padding on the column's own device. A bare .cuda() would put
    # it on the current default GPU, which is wrong when the column lives on
    # another GPU.
    zeros_padding = torch.zeros(previous_state_column.size(0),
                                hidden_states_size, 1,
                                device=previous_state_column.device)
    all_but_last = previous_state_column[:, :, 0:(height - 1)]
    # torch.cat returns a new tensor, so the input needs no prior clone().
    return torch.cat((zeros_padding, all_but_last), 2)
def compute_ctc_loss_version_two(self, probabilities, labels_row_tensor):
    """Compute a warp-ctc loss for `probabilities` against fixed labels.

    Debugging variant: the labels, label sizes and probability sizes are
    hard-coded below and `labels_row_tensor` is not used. Intermediate values
    are printed.

    Args:
        probabilities: tensor passed to warpctc_pytorch.CTCLoss as the
            activations.
        labels_row_tensor: unused.

    Returns:
        The loss returned by the CTCLoss module.
    """
    ctc_loss = warpctc_pytorch.CTCLoss()

    probs = probabilities
    print(
        "test_ctc_loss_probabilities_match_labels_third_baidu_example - probs: "
        + str(probs))
    print(
        "test_ctc_loss_probabilities_match_labels_third_baidu_example - probs.size(): "
        + str(probs.size()))

    # See: https://github.com/SeanNaren/warp-ctc/issues/29
    # All label sequences are concatenated, without blanks/padding,
    # and label_sizes lists the sizes without padding.
    labels = Variable(torch.IntTensor([1, 3, 3, 2, 3]))
    label_sizes = Variable(torch.IntTensor([1, 2, 2]))
    # Per the original note, this must match the number of probabilities to
    # avoid a crash.
    probs_sizes = Variable(torch.IntTensor([1, 3, 3]))

    # Tells autograd to compute gradients for probs.
    probs = Variable(probs, requires_grad=True)
    print("probs: " + str(probs))

    if Utils.use_cuda():
        # Only the activations and the loss module are moved to the GPU; the
        # labels and the size tensors stay on the CPU.
        probs = probs.cuda()
        ctc_loss = ctc_loss.cuda()

    loss = ctc_loss(probs, labels, probs_sizes, label_sizes)
    print("loss: " + str(loss))
    return loss
def check_inputs_is_right_type(inputs, input_is_list: bool):
    """Raise if the image input is not a byte tensor of the expected kind.

    The expected type string is "torch.cuda.ByteTensor" when Utils.use_cuda()
    is true and "torch.ByteTensor" otherwise.

    Args:
        inputs: a tensor, or a list of tensors when `input_is_list` is true.
        input_is_list: if true, only the first element of `inputs` is checked.

    Raises:
        RuntimeError: if the type string of the checked tensor differs from
            the expected one.
    """
    # Compare against the type strings directly; there is no need to
    # construct a (possibly GPU) tensor on every call just to read its type.
    if Utils.use_cuda():
        expected_type = "torch.cuda.ByteTensor"
    else:
        expected_type = "torch.ByteTensor"

    # If inputs is a list, check the first element of the list.
    item_to_compare = inputs[0] if input_is_list else inputs
    actual_type = item_to_compare.type()

    if actual_type != expected_type:
        raise RuntimeError("Error: expected a " + expected_type +
                           " type image tensor" + " but got : " +
                           str(actual_type))
def test_tensor_list_block_chunking_followed_by_dechunking_reconstructs_original(
        tensor_one, tensor_two, block_size,
        tensors_all_have_same_height: bool):
    """Check that chunking two tensors and dechunking them is the identity.

    The two tensors are chunked into blocks with a TensorListChunking,
    dechunked again, and each reconstructed tensor is compared element-wise
    with its original. Intermediate values are printed.

    Raises:
        RuntimeError: if a reconstructed tensor differs from its original.
    """
    if Utils.use_cuda():
        tensor_one, tensor_two = tensor_one.cuda(), tensor_two.cuda()

    print("tensor_one: " + str(tensor_one))
    print("tensor_two: " + str(tensor_two))

    originals = [tensor_one, tensor_two]
    chunker = TensorListChunking.create_tensor_list_chunking(
        originals, block_size)

    blocks = chunker.chunk_tensor_list_into_blocks_concatenate_along_batch_dimension(
        originals, tensors_all_have_same_height)
    print("chunking: " + str(blocks))
    print("chunking.size(): " + str(blocks.size()))

    reconstructed_tensors = chunker.\
        dechunk_block_tensor_concatenated_along_batch_dimension_changed_block_size(
            blocks, block_size)
    print("dechunked_tensor_list: " + str(reconstructed_tensors))

    for original, reconstructed in zip(originals, reconstructed_tensors):
        # Element-wise comparison reduced to a single truth value.
        tensors_are_equal = torch.eq(original, reconstructed).all()
        print("tensors_are_equal: " + str(tensors_are_equal))
        if not tensors_are_equal:
            raise RuntimeError("Error: original tensor " + str(original) +
                               " and dechunked tensor " +
                               str(reconstructed) + " are not equal")
        print(
            "Success: original tensor and dechunked(chunked(tensor)) are equal"
        )
def create_skewed_images_variable_four_dim(x):
    """Return the skewed images for `x`, on the device of `x` when using CUDA.

    The skewing itself is delegated to
    ImageInputTransformer.create_row_diagonal_offset_tensors.
    """
    skewed = ImageInputTransformer.create_row_diagonal_offset_tensors(x)
    if not Utils.use_cuda():
        return skewed
    # Make sure the result lives on the same GPU as the input.
    return skewed.to(x.get_device())
def chunk_tensor_into_blocks_concatenate_along_batch_dimension_no_cat(
        self, tensor: torch.tensor):
    """Cut `tensor` into blocks and stack the blocks along dimension 0.

    The tensor is split along dimension 2 in steps of block_size.height and
    then along dimension 3 in steps of block_size.width. Block number `k`
    (row-major over the splits) is written to rows
    [k * batch_size, (k + 1) * batch_size) of the result. No torch.cat is
    used; the blocks are copied into a pre-allocated result.

    Args:
        tensor: four-dimensional tensor with the batch in dimension 0.

    Returns:
        float32 tensor of size (blocks_per_column * blocks_per_row *
        batch_size, channels, block height, block width) on the device of
        `tensor`.
    """
    total_blocks = self.blocks_per_column * self.blocks_per_row
    batch_size = tensor.size(0)
    # The first dimension must fit all blocks stacked in a single column,
    # while also keeping the batch dimension.
    height_in_batch_dimension = total_blocks * batch_size
    print("height in batch dimension: " + str(height_in_batch_dimension))

    # Create the zeros directly on the device that holds the input.
    result = torch.zeros(height_in_batch_dimension, tensor.size(1),
                         self.block_size.height, self.block_size.width,
                         dtype=torch.float32, device=tensor.device)

    index = 0
    for row_block in torch.split(tensor, self.block_size.height, 2):
        for column_block in torch.split(row_block, self.block_size.width, 3):
            result[index * batch_size:(index + 1) * batch_size] = column_block
            # Advance to the next slot. Without this every block was written
            # to the first batch_size rows, overwriting the previous block.
            index += 1
    return result
def test_tensor_block_chunking_followed_by_dechunking_reconstructs_original():
    """Check that chunking a tensor into 2x2 blocks and dechunking is lossless.

    A tensor with the values 1..96, viewed as (2, 2, 4, 6), is chunked with a
    TensorChunking for original size (4, 6) and block size (2, 2), dechunked
    again and compared element-wise with the original. Intermediate values are
    printed.

    Raises:
        RuntimeError: if the dechunked tensor differs from the original.
    """
    tensor = torch.Tensor([range(1, 97)]).view(2, 2, 4, 6)
    if Utils.use_cuda():
        tensor = tensor.cuda()

    print(tensor)
    print("tensor[0, 0, :, :]: " + str(tensor[0, 0, :, :]))

    chunker = TensorChunking.create_tensor_chunking(
        SizeTwoDimensional.create_size_two_dimensional(4, 6),
        SizeTwoDimensional.create_size_two_dimensional(2, 2))

    blocks = chunker.chunk_tensor_into_blocks_concatenate_along_batch_dimension(
        tensor)
    print("chunking: " + str(blocks))
    print("chunking.size(): " + str(blocks.size()))

    reconstructed = chunker.dechunk_block_tensor_concatenated_along_batch_dimension(
        blocks)
    print("dechunked_tensor: " + str(reconstructed))

    # Element-wise comparison reduced to a single truth value.
    tensors_are_equal = torch.eq(tensor, reconstructed).all()
    print("tensors_are_equal: " + str(tensors_are_equal))
    if not tensors_are_equal:
        raise RuntimeError("Error: original tensor " + str(tensor) +
                           " and dechunked tensor " + str(reconstructed) +
                           " are not equal")
    print("Success: original tensor and dechunked(chunked(tensor)) are equal")
def create_row_diagonal_offset_tensors_parallel_breaks_gradient(
        image_tensors):
    """Build the row-shifted ("skewed") images by slice assignment.

    Row `y` of every image is padded with `y` leading zeros and with tailing
    zeros up to the skewed width, and is then copied into row `y` of a
    pre-allocated zeros tensor.

    The copy gives the result a CopySlices grad_fn. The original author was
    unsure whether that is harmless and observed this variant to be slower;
    the grad_fn is printed before returning.

    Args:
        image_tensors: four-dimensional tensor; dimensions 0..3 are read as
            number of images, channels, height and width.

    Returns:
        The skewed images. The result tensor is created with torch.zeros
        without a device argument.
    """
    if Utils.use_cuda():
        # https://discuss.pytorch.org/t/which-device-is-model-tensor-stored-on/4908/7
        device = image_tensors.get_device()

    number_of_channels = image_tensors.size(1)
    height = image_tensors.size(2)
    width = image_tensors.size(3)
    number_of_image_tensors = image_tensors.size(0)

    skewed_width = ImageInputTransformer.\
        get_skewed_images_width_four_dimensional_tensor(image_tensors)
    transformed_images = torch.zeros(number_of_image_tensors,
                                     number_of_channels, height, skewed_width)

    def zero_padding(padding_width):
        # Zeros are created on the CPU and moved to the GPU when CUDA is used.
        padding = torch.zeros(number_of_image_tensors, number_of_channels,
                              padding_width)
        if Utils.use_cuda():
            padding = padding.to(device)
        return padding

    for y in range(image_tensors.size(2)):
        # image_tensors[:, :, y, :] selects row y of every image and channel.
        new_row = image_tensors[:, :, y, :]
        if y > 0:
            new_row = torch.cat((zero_padding(y), new_row), 2)

        tailing_zeros = transformed_images.size(3) - width - y
        if tailing_zeros > 0:
            new_row = torch.cat((new_row, zero_padding(tailing_zeros)), 2)

        transformed_images[:, :, y, :] = new_row

    # Something can be found about CopySlices at
    # https://github.com/pytorch/pytorch/blob/master/torch/csrc/autograd/functions/tensor.cpp
    # but it is not very conclusive.
    print(
        "create_row_diagonal_offset_tensor_parallel_breaks_gradient: transformed_images.grad_fn: "
        + str(transformed_images.grad_fn))
    return transformed_images
def train_mdrnn(train_loader, test_loader, input_channels: int,
                input_size: SizeTwoDimensional, hidden_states_size: int,
                batch_size, compute_multi_directional: bool,
                use_dropout: bool):
    """Build a two-layer-pair MDLSTM classifier and run a debug training loop.

    A MultiDimensionalLSTMLayerPairStacking two-layer-pair network is wrapped
    in a MultiDimensionalRNNToSingleClassNetwork and trained with SGD and a
    cross-entropy loss for up to four epochs.

    NOTE(review): in its current state the loop registers the
    find_bad_gradients hooks, renders the gradient graph to
    'mdlstm_find_bad_gradients.dot' / png and then raises RuntimeError
    unconditionally after the first backward pass. Gradient clipping,
    optimizer.step(), the statistics printing and the final evaluation are
    therefore never reached.

    NOTE(review): `input_channels`, `compute_multi_directional` and
    `use_dropout` are not used by the active code; they were only used by
    network variants that are now commented out.

    Args:
        train_loader: iterable yielding (inputs, labels) batches.
        test_loader: passed on to evaluate_mdrnn.
        input_channels: unused (see note above).
        input_size: passed to the single-class network factory.
        hidden_states_size: passed to the layer-pair network factory.
        batch_size: passed on to evaluate_mdrnn.
        compute_multi_directional: unused (see note above).
        use_dropout: unused (see note above).

    Raises:
        RuntimeError: always, after the first backward pass (see note above).
    """
    import torch.optim as optim

    criterion = nn.CrossEntropyLoss()

    # Earlier experiments (now removed from the active code) built the network
    # with MultiDimensionalRNN, MultiDimensionalRNNFast, MultiDimensionalLSTM
    # (normal and fast), BlockMultiDimensionalLSTM,
    # BlockMultiDimensionalLSTMLayerPair and
    # BlockMultiDimensionalLSTMLayerPairStacking (one layer pair plus a second
    # block-convolution or block-mdlstm layer).

    # http://pytorch.org/docs/master/notes/cuda.html
    # NOTE(review): the device is hard-coded to the first GPU; it is only used
    # in the Utils.use_cuda() branches and in the final evaluate_mdrnn call.
    device = torch.device("cuda:0")
    # device_ids should include device!
    # device_ids lists all the gpus that may be used for parallelization
    # device is the initial device the model will be put on
    #device_ids = [0, 1]
    device_ids = [0]

    mdlstm_block_size = SizeTwoDimensional.create_size_two_dimensional(4, 4)

    # mdlstm_block_size = SizeTwoDimensional.create_size_two_dimensional(4, 2)
    block_strided_convolution_block_size = SizeTwoDimensional.create_size_two_dimensional(
        4, 2)
    multi_dimensional_rnn = MultiDimensionalLSTMLayerPairStacking.\
        create_two_layer_pair_network(hidden_states_size, mdlstm_block_size,
                                      block_strided_convolution_block_size, False)

    network = MultiDimensionalRNNToSingleClassNetwork.\
        create_multi_dimensional_rnn_to_single_class_network(multi_dimensional_rnn, input_size)

    #multi_dimensional_rnn = Net()

    if Utils.use_cuda():
        #multi_dimensional_rnn = multi_dimensional_rnn.cuda()
        # The network is only wrapped in DataParallel on the CUDA path.
        network = nn.DataParallel(network, device_ids=device_ids)
        network.to(device)

    print_number_of_parameters(multi_dimensional_rnn)

    #optimizer = optim.SGD(multi_dimensional_rnn.parameters(), lr=0.001, momentum=0.9)

    # Adding some weight decay seems to do magic, see: http://pytorch.org/docs/master/optim.html
    optimizer = optim.SGD(network.parameters(), lr=0.001, momentum=0.9, weight_decay=1e-5)

    # Faster learning
    #optimizer = optim.SGD(multi_dimensional_rnn.parameters(), lr=0.01, momentum=0.9)

    start = time.time()

    num_gradient_corrections = 0

    for epoch in range(4):  # loop over the dataset multiple times

        running_loss = 0.0
        for i, data in enumerate(train_loader, 0):

            # get the inputs
            inputs, labels = data

            if Utils.use_cuda():
                inputs = inputs.to(device)
            # Set requires_grad(True) directly and only for the input
            inputs.requires_grad_(True)

            # Labels need no gradient apparently
            if Utils.use_cuda():
                labels = labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            # The time_start_* values are only consumed by timing prints that
            # are currently commented out.
            time_start_network_forward = time.time()
            outputs = network(inputs)
            # print("Time used for network forward: " + str(util.timing.time_since(time_start_network_forward)))

            time_start_loss_computation = time.time()
            loss = criterion(outputs, labels)
            # print("Time used for loss computation: " + str(util.timing.time_since(time_start_loss_computation)))

            time_start_loss_backward = time.time()

            # Debug instrumentation: record the gradient graph during the
            # backward pass, save and render it, then stop.
            get_dot = modules.find_bad_gradients.register_hooks(outputs)
            loss.backward()
            dot = get_dot()
            dot.save('mdlstm_find_bad_gradients.dot')
            render('dot', 'png', 'mdlstm_find_bad_gradients.dot')
            raise RuntimeError("stopping after find bad gradients")

            # NOTE(review): everything below is unreachable while the raise
            # above is in place.
            # print("Time used for loss backward: " + str(util.timing.time_since(time_start_loss_backward)))

            # Perform gradient clipping
            made_gradient_norm_based_correction = clip_gradient(
                multi_dimensional_rnn)
            if made_gradient_norm_based_correction:
                num_gradient_corrections += 1

            optimizer.step()

            # print statistics
            running_loss += loss.data
            if i % 100 == 99:  # print every 100 mini-batches
                end = time.time()
                running_time = end - start
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 100) +
                      " Running time: " + str(running_time))
                print("Number of gradient norm-based corrections: " +
                      str(num_gradient_corrections))
                running_loss = 0.0
                num_gradient_corrections = 0

    print('Finished Training')

    # Run evaluation
    # multi_dimensional_rnn.set_training(False) # Normal case
    # NOTE(review): `.module` is the DataParallel wrapper's attribute, and the
    # network is only wrapped when Utils.use_cuda() is true.
    network.module.set_training(False)  # When using DataParallel
    # NOTE(review): four positional arguments are passed here; confirm they
    # match the signature of the evaluate_mdrnn this name resolves to.
    evaluate_mdrnn(test_loader, network, batch_size, device)
def compute_ctc_loss(self, probabilities, labels_row_tensor,
                     batch_size: int, width_reduction_factor: int):
    """Compute the warp-ctc loss of `probabilities` for the given labels.

    The padded label rows are turned into the arguments self.ctc_loss takes:
    one flat labels tensor without padding, the label lengths and the
    probability lengths. The probabilities are transposed so that the batch
    is in the second dimension.

    Args:
        probabilities: tensor with the batch in dimension 0.
        labels_row_tensor: padded labels, one row per example; checked by
            WarpCTCLossInterface to contain no zeros.
        batch_size: expected size of the batch dimension.
        width_reduction_factor: passed on when computing the probability
            lengths.

    Returns:
        The loss returned by self.ctc_loss.

    Raises:
        RuntimeError: if the second dimension of the transposed probabilities
            does not equal `batch_size`.
    """
    interface = WarpCTCLossInterface
    interface.check_labels_row_tensor_contains_no_zeros(labels_row_tensor)

    flat_labels = interface.\
        create_one_dimensional_labels_tensor_removing_padding_labels(
            labels_row_tensor)
    labels = Variable(flat_labels)

    label_lengths = interface.\
        create_sequence_lengths_specification_tensor_different_lengths(
            labels_row_tensor)
    label_sizes = Variable(label_lengths)

    probability_lengths = interface.\
        create_probabilities_lengths_specification_tensor_different_lengths(
            labels_row_tensor, width_reduction_factor, probabilities)
    probabilities_sizes = Variable(probability_lengths)

    # The ctc_loss interface expects the second dimension to be the batch
    # size, so the first and second dimension must be swapped.
    batch_second = probabilities.transpose(0, 1).contiguous()

    if Utils.use_cuda():
        self.ctc_loss = self.ctc_loss.to(probabilities.get_device())
        # Only the probabilities go to the GPU. The original notes that
        # moving the labels to the GPU caused a segmentation fault.
        batch_second = batch_second.cuda()

    # Sanity check: the batch size must be the right dimension of the
    # probabilities tensor, otherwise the ctc_loss function will give wrong
    # results and/or crash.
    actual_batch_size = batch_second.size(1)
    if actual_batch_size != batch_size:
        message = (
            "Error: the second dimension of probabilities_batch_second_dimension "
            + "should equal batch_size " + str(batch_size) + " but is " +
            str(actual_batch_size))
        raise RuntimeError(message)

    return self.ctc_loss(batch_second, labels, probabilities_sizes,
                         label_sizes)
def evaluate_mdrnn(test_loader, multi_dimensional_rnn, device, vocab_list: list, blank_symbol: str,
                   horizontal_reduction_factor: int, image_input_is_unsigned_int: bool,
                   input_is_list: bool,
                   language_model_parameters: LanguageModelParameters,
                   save_score_table_file_path: str, epoch_number: int,
                   epoch_statistics: EpochStatistics):
    """Decode the test set with the network and report accuracy, CER and WER.

    For every batch the network outputs are turned into probabilities with a
    softmax over dimension 2 and decoded with a decoder from
    Evaluator.create_decoder. The first beam result of each example is
    converted to a string and compared with the reference label string.
    After all batches, character error rates (with and without word
    separators) and the word error rate are computed and printed. If
    `save_score_table_file_path` is not None, a score line is appended to
    that file, preceded by a header if the file did not exist yet.

    Args:
        test_loader: iterable yielding (inputs, labels); `test_loader.dataset`
            is used for the total number of examples.
        multi_dimensional_rnn: the network, called as
            network(inputs, max_input_width).
        device: device the inputs and labels are moved to when using CUDA.
        vocab_list: vocabulary used by the decoder and string conversion.
        blank_symbol: blank symbol passed to the decoder and label conversion.
        horizontal_reduction_factor: passed on when computing the sequence
            lengths of the probabilities.
        image_input_is_unsigned_int: if true, the inputs are type-checked and
            converted to float.
        input_is_list: whether `inputs` is a list of tensors.
        language_model_parameters: passed to the decoder; None means that no
            language model is used in the decoder.
        save_score_table_file_path: file to append scores to, or None.
        epoch_number: written to the score line.
        epoch_statistics: written to the score header and line.

    Returns:
        ValidationStats built from the total number of examples, the number
        of exactly matching outputs, the CER excluding word separators and
        the WER.
    """

    correct = 0
    # NOTE(review): `total` is accumulated below but never read; the reported
    # total is len(test_loader.dataset).
    total = 0

    output_strings = list([])
    reference_labels_strings = list([])

    for data in test_loader:
        inputs, labels = data

        if Utils.use_cuda():
            labels = labels.to(device)

            if input_is_list:
                inputs = Utils.move_tensor_list_to_device(inputs, device)
            else:
                inputs = inputs.to(device)

        # If the image input comes in the form of unsigned ints, they need to
        # be converted to floats (after moving to GPU, i.e. directly on GPU
        # which is faster)
        if image_input_is_unsigned_int:
            Trainer.check_inputs_is_right_type(inputs, input_is_list)
            inputs = IamLinesDataset.convert_unsigned_int_image_tensor_or_list_to_float_image_tensor_or_list(inputs)

        # https://github.com/pytorch/pytorch/issues/235
        # Running the evaluation without computing gradients is the recommended way
        # since this saves time, and more importantly, memory
        with torch.no_grad():

            max_input_width = NetworkToSoftMaxNetwork.get_max_input_width(inputs)
            outputs = multi_dimensional_rnn(inputs, max_input_width)

            probabilities_sum_to_one_dimension = 2
            # Outputs is the output of the linear layer which is the input to warp_ctc
            # But to get probabilities for the decoder, the softmax function needs to
            # be applied to the outputs
            probabilities = torch.nn.functional. \
                softmax(outputs, probabilities_sum_to_one_dimension)

            print(">>> evaluate_mdrnn - outputs.size: " + str(outputs.size()))
            print(">>> evaluate_mdrnn - probabilities.size: " + str(probabilities.size()))

            beam_size = Evaluator.BEAM_SIZE

            # This value specifies the number of (character) probabilities kept in the
            # decoder. If it is set equal or larger to the number of characters in the
            # vocabulary, no pruning is done for it
            cutoff_top_n = len(vocab_list)  # No pruning for this parameter
            print(">>> evaluate_mdrnn - len(vocab_list): " + str(len(vocab_list)))
            # NOTE(review): the decoder is re-created for every batch although
            # none of its arguments change between batches.
            decoder = Evaluator.create_decoder(vocab_list, cutoff_top_n, beam_size,
                                               blank_symbol,
                                               language_model_parameters)
            label_sizes = WarpCTCLossInterface. \
                create_sequence_lengths_specification_tensor_different_lengths(labels)

            sequence_lengths = WarpCTCLossInterface.\
                create_probabilities_lengths_specification_tensor_different_lengths(
                    labels, horizontal_reduction_factor, probabilities)
            sequence_lengths = Evaluator.increase_sequence_lengths_by_one(sequence_lengths)

            # beam_scores and timesteps are not used below.
            beam_results, beam_scores, timesteps, out_seq_len = \
                decoder.decode(probabilities.data, sequence_lengths)

            total += labels.size(0)

            for example_index in range(0, beam_results.size(0)):
                # Only the first beam result of each example is evaluated.
                beam_results_sequence = beam_results[example_index][0]
                use_language_model_in_decoder = language_model_parameters is not None
                output_string = Evaluator.convert_to_string(
                    beam_results_sequence, vocab_list, out_seq_len[example_index][0],
                    use_language_model_in_decoder)
                example_labels_with_padding = labels[example_index]
                # Extract the real example labels, removing the padding labels
                reference_labels = example_labels_with_padding[0:label_sizes[example_index]]

                reference_labels_string = Evaluator.convert_labels_tensor_to_string(
                    reference_labels, vocab_list, blank_symbol)

                # An example counts as correct only on an exact string match.
                if reference_labels_string == output_string:
                    correct += 1
                    correct_string = "correct"
                else:
                    correct_string = "wrong"

                print(">>> evaluate_mdrnn - output: \"" + output_string + "\" " +
                      "\nreference: \"" + reference_labels_string + "\" --- "
                      + correct_string)

                output_strings.append(output_string)
                reference_labels_strings.append(reference_labels_string)

    # The last argument selects whether word separators are included.
    cer_including_word_separators = evaluation_metrics.character_error_rate. \
        compute_character_error_rate_for_list_of_output_reference_pairs_fast(
            output_strings, reference_labels_strings, True)

    cer_excluding_word_separators = evaluation_metrics.character_error_rate. \
        compute_character_error_rate_for_list_of_output_reference_pairs_fast(
            output_strings, reference_labels_strings, False)

    wer = evaluation_metrics.word_error_rate. \
        compute_word_error_rate_for_list_of_output_reference_pairs(
            output_strings, reference_labels_strings)

    total_examples = len(test_loader.dataset)
    validation_stats = ValidationStats(total_examples, correct, cer_excluding_word_separators, wer)
    # https://stackoverflow.com/questions/3395138/using-multiple-arguments-for-string-formatting-in-python-e-g-s-s
    print("Accuracy of the network on the {} test inputs: {:.2f} % accuracy".format(
        total_examples, validation_stats.get_accuracy()))

    print("Character Error Rate (CER)[%] of the network on the {} test inputs, "
          "including word separators: {:.3f}  CER".format(
              total_examples, cer_including_word_separators))

    print("Character Error Rate (CER)[%] of the network on the {} test inputs, "
          "excluding word separators: {:.3f}  CER".format(
              total_examples, cer_excluding_word_separators))

    print("Word Error Rate (WER)[%] of the network on the {} test inputs: {:.3f}  WER".format(
        total_examples, wer))

    if save_score_table_file_path is not None:
        # Checked before opening, because opening in append mode creates the
        # file; the header is only written to a new file.
        score_file_existed = os.path.exists(save_score_table_file_path)
        # Opens the file in append-mode, create if it doesn't exists
        with open(save_score_table_file_path, "a") as scores_table_file:
            if not score_file_existed:
                scores_table_file.write(Evaluator.score_table_header(total_examples, epoch_statistics))
            scores_table_file.write(Evaluator.score_table_line(epoch_number, correct,
                                                               validation_stats.get_accuracy(),
                                                               cer_including_word_separators,
                                                               cer_excluding_word_separators,
                                                               wer,
                                                               epoch_statistics) + "\n")

    return validation_stats
def train_one_epoch(self, train_loader, epoch: int, start: int, batch_size,
                    device, inputs_is_list: bool, report_func=None):
    """
    Train the model for one epoch over all minibatches of train_loader.

    Args:
        train_loader: the train loader; it is iterated for
            (inputs, labels) minibatches, and len(train_loader) and
            len(train_loader.dataset) are used for progress reporting
        epoch(int): the epoch number (only used in the progress output)
        start: time in seconds training started
        batch_size: the nominal minibatch size; passed to the optimizer
            step together with the actual size of the current batch, and
            used in the progress output
        device: the device the inputs are moved to when Utils.use_cuda()
            is true
        inputs_is_list(bool): whether inputs is a list of tensors rather
            than a single tensor
        report_func(fn): function for logging (not used by this method)

    Returns:
        A tuple (average loss per minibatch, total_examples), where
        total_examples is the number of examples actually processed in
        this epoch. For a train_loader that yields no minibatches a
        warning is printed and (0.0, 0) is returned.
    """
    num_gradient_corrections = 0
    gradient_norms_sum = 0
    running_loss = 0.0
    total_summed_loss_epoch = 0.0
    total_examples = 0
    number_of_minibatches = 0

    time_start = time.time()
    for i, data in enumerate(train_loader, 0):
        # get the inputs
        inputs, labels = data

        Trainer.check_there_are_no_zero_labels(labels, inputs_is_list)

        # If minimize_horizontal_padding is used, inputs will be a list
        if Utils.use_cuda():
            if not inputs_is_list:
                inputs = inputs.to(device)
            else:
                inputs = Utils.move_tensor_list_to_device(inputs, device)

        # If the image input comes in the form of unsigned ints, they need to
        # be converted to floats (after moving to GPU, i.e. directly on GPU
        # which is faster)
        if self.model_properties.image_input_is_unsigned_int:
            Trainer.check_inputs_is_right_type(inputs, inputs_is_list)
            inputs = (
                IamLinesDataset.
                convert_unsigned_int_image_tensor_or_list_to_float_image_tensor_or_list(
                    inputs))

        # Set requires_grad(True) directly and only for the input.
        # Labels need no gradient, and must remain on CPU for warp-ctc loss.
        if inputs_is_list:
            for element in inputs:
                element.requires_grad_(True)
        else:
            inputs.requires_grad_(True)

        # forward + backward + optimize
        max_input_width = NetworkToSoftMaxNetwork.get_max_input_width(inputs)
        outputs = self.model(inputs, max_input_width)

        if inputs_is_list:
            number_of_examples = len(inputs)
        else:
            number_of_examples = inputs.size(0)

        loss = self.warp_ctc_loss_interface.compute_ctc_loss(
            outputs, labels, number_of_examples,
            self.model_properties.width_reduction_factor)
        total_examples += number_of_examples

        # See: https://github.com/SeanNaren/deepspeech.pytorch/blob/master/train.py
        # The averaging seems to help learning (but a smaller learning rate
        # might have the same effect!)
        loss = loss / number_of_examples  # average the loss by minibatch size

        loss_sum = loss.data.sum()
        inf = float("inf")
        if loss_sum == inf or loss_sum == -inf:
            # NOTE(review): only the value used for the loss statistics is
            # replaced here; backward() below still runs on the original loss.
            print("WARNING: received an inf loss, setting loss value to 0")
            loss_value = 0
        else:
            loss_value = loss.item()

        # zero the parameter gradients
        self.optimizer.zero_grad()
        self.model.zero_grad()

        loss = loss.contiguous()
        loss.backward()

        # Perform an update step, including norm-based gradient clipping.
        # Compensate the maximum gradient norm by the factor:
        # number_of_examples/batch_size. This is to avoid over-correction
        # (too much learning) for the last batch, which contains less examples.
        made_gradient_norm_based_correction, total_norm = \
            self.optimizer.step_with_scaling_for_size_current_batch(
                number_of_examples, batch_size)
        print("trainer - total norm: " + str(total_norm))

        if made_gradient_norm_based_correction:
            num_gradient_corrections += 1
        gradient_norms_sum += total_norm

        running_loss += loss_value
        total_summed_loss_epoch += loss_value

        if i % 10 == 9:  # print every 10 mini-batches
            end = time.time()
            running_time = end - start
            print('[%d, %5d] loss: %.3f' %
                  (epoch, i + 1, running_loss / 10) +
                  " Running time: " + str(running_time))
            average_norm = gradient_norms_sum / 10
            print("Number of gradient norm-based corrections: " +
                  str(num_gradient_corrections))
            print("Average gradient total norm: " + str(average_norm))
            running_loss = 0.0
            num_gradient_corrections = 0
            gradient_norms_sum = 0

            percent = (i + 1) / float(len(train_loader))
            examples_processed = (i + 1) * batch_size
            # Kept in its own local: reusing total_examples here would
            # overwrite the running count that this method returns.
            dataset_size = len(train_loader.dataset)
            print("Processed " + str(examples_processed) + " of " +
                  str(dataset_size) + " examples in this epoch")
            print(">>> Time used in current epoch: " +
                  str(util.timing.time_since_and_expected_remaining_time(
                      time_start, percent)))
            sys.stdout.flush()

        number_of_minibatches += 1

    if number_of_minibatches == 0:
        # Nothing was trained; avoid dividing by zero below.
        print("WARNING: train_loader yielded no minibatches, "
              "returning an average loss of 0")
        return 0.0, total_examples

    average_loss_per_minibatch = total_summed_loss_epoch / number_of_minibatches
    return average_loss_per_minibatch, total_examples