def gather_list(tensor_lists, destination=None):
    """Gathers tensor lists from multiple GPUs.

    Arguments:
        tensor_lists (Iterable[Iterable[Tensor]]): iterable of the tensor
            lists to gather.
        destination (int, optional): output device (-1 means CPU, default:
            current device).

    Returns:
        A tensor list located on the ``destination`` device, that is the
        result of concatenating the elements of ``tensor_lists``.
    """
    for tensor_list in tensor_lists:
        for element in tensor_list:
            assert element.is_cuda, "gather expects all inputs to be on GPUs"

    # Concatenate all the tensor lists into one flat result list
    result = []
    for tensor_list in tensor_lists:
        result.extend(tensor_list)

    if destination is None:
        destination = torch.cuda.current_device()
    # TODO: if copying to CPU, allocate a pinned buffer, do async copies to it,
    # and copy it to regular memory
    return Utils.move_tensor_list_to_device(result, destination)
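

# A minimal usage sketch for gather_list. It assumes at least two visible CUDA
# devices and relies on the project's Utils helper to interpret -1 as CPU; the
# tensor shapes are illustrative only.
def _example_gather_list():
    import torch

    lists = [
        [torch.randn(2, 3, device="cuda:0"), torch.randn(2, 5, device="cuda:0")],
        [torch.randn(2, 4, device="cuda:1")],
    ]
    gathered = gather_list(lists, destination=-1)  # -1 moves the result to CPU
    assert len(gathered) == 3
    assert not gathered[0].is_cuda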
def scatter_list(list_of_tensors, devices, chunk_sizes=None, streams=None):
    """Scatters a list of tensors across multiple GPUs.

    Arguments:
        list_of_tensors (list(Tensor)): list of tensors to scatter.
        devices (Iterable[int]): iterable of ints, specifying among which
            devices the tensors should be scattered.
        chunk_sizes (Iterable[int], optional): sizes of chunks to be placed on
            each device. It should match ``devices`` in length and sum to
            ``len(list_of_tensors)``. If not specified, the list of tensors
            will be divided into equal chunks.
        streams (Iterable[Stream], optional): CUDA streams on which to perform
            the copies, one per device.

    Returns:
        A tuple containing chunks of ``list_of_tensors``, spread across the
        given ``devices``.
    """
    if chunk_sizes is None:
        chunks = chunk_list(list_of_tensors, len(devices))
    else:
        assert sum(chunk_sizes) == len(list_of_tensors), \
            "given chunk sizes don't sum up to the list's length " \
            "(sum(chunk_sizes) == {}, but expected {})".format(
                sum(chunk_sizes), len(list_of_tensors))
        assert min(chunk_sizes) > 0, "got a non-positive chunk size"
        # _accumulate yields the end index of each chunk, so a chunk of size
        # ``size`` starts at ``end - size``
        chunks = [
            list_of_tensors[end - size:end]
            for end, size in zip(_accumulate(chunk_sizes), chunk_sizes)
        ]
    # TODO: copy to a pinned buffer first (if copying from CPU)
    if streams is None:
        streams = [None] * len(devices)
    outputs = []
    for device, chunk, stream in zip(devices, chunks, streams):
        with torch.cuda.device(device), torch.cuda.stream(stream):
            outputs.append(
                Utils.move_tensor_list_to_device(chunk, device,
                                                 non_blocking=True))
    return tuple(outputs)
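

# A minimal usage sketch for scatter_list. It assumes two CUDA devices and the
# project's Utils/chunk_list helpers; the chunk sizes are illustrative only.
def _example_scatter_list():
    import torch

    tensors = [torch.randn(2, n) for n in (3, 4, 5, 6)]
    per_device = scatter_list(tensors, devices=[0, 1], chunk_sizes=[1, 3])
    # per_device[0] holds one tensor on cuda:0, per_device[1] three on cuda:1
    assert len(per_device[0]) == 1 and len(per_device[1]) == 3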
def evaluate_mdrnn(test_loader, multi_dimensional_rnn, device, vocab_list: list,
                   blank_symbol: str, horizontal_reduction_factor: int,
                   image_input_is_unsigned_int: bool, input_is_list: bool,
                   language_model_parameters: LanguageModelParameters,
                   save_score_table_file_path: str, epoch_number: int,
                   epoch_statistics: EpochStatistics):

    correct = 0
    total = 0

    output_strings = []
    reference_labels_strings = []

    for data in test_loader:
        inputs, labels = data

        if Utils.use_cuda():
            labels = labels.to(device)
            if input_is_list:
                inputs = Utils.move_tensor_list_to_device(inputs, device)
            else:
                inputs = inputs.to(device)

        # If the image input comes in the form of unsigned ints, it needs to
        # be converted to floats (after moving to GPU, i.e. directly on GPU,
        # which is faster)
        if image_input_is_unsigned_int:
            Trainer.check_inputs_is_right_type(inputs, input_is_list)
            inputs = IamLinesDataset.convert_unsigned_int_image_tensor_or_list_to_float_image_tensor_or_list(inputs)

        # https://github.com/pytorch/pytorch/issues/235
        # Running the evaluation without computing gradients is the recommended
        # way, since it saves time and, more importantly, memory
        with torch.no_grad():
            max_input_width = NetworkToSoftMaxNetwork.get_max_input_width(inputs)
            outputs = multi_dimensional_rnn(inputs, max_input_width)

            probabilities_sum_to_one_dimension = 2
            # ``outputs`` is the output of the linear layer, which is the input
            # to warp_ctc. To get probabilities for the decoder, the softmax
            # function needs to be applied to it.
            probabilities = torch.nn.functional. \
                softmax(outputs, probabilities_sum_to_one_dimension)

            # Appending a preceding word separator to the probabilities is no
            # longer necessary with the fixed word separator specification in
            # the decoder and a normal language model

            print(">>> evaluate_mdrnn - outputs.size: " + str(outputs.size()))
            print(">>> evaluate_mdrnn - probabilities.size: " + str(probabilities.size()))

            # Use a larger beam size than the normal default of 100, to see if
            # it further improves the results
            beam_size = Evaluator.BEAM_SIZE
            # This value specifies the number of (character) probabilities kept
            # in the decoder. If it is set equal to or larger than the number
            # of characters in the vocabulary, no pruning is done for it.
            cutoff_top_n = len(vocab_list)  # No pruning for this parameter
            print(">>> evaluate_mdrnn - len(vocab_list): " + str(len(vocab_list)))
            decoder = Evaluator.create_decoder(vocab_list, cutoff_top_n,
                                               beam_size, blank_symbol,
                                               language_model_parameters)
            label_sizes = WarpCTCLossInterface. \
                create_sequence_lengths_specification_tensor_different_lengths(labels)

            sequence_lengths = WarpCTCLossInterface. \
                create_probabilities_lengths_specification_tensor_different_lengths(
                    labels, horizontal_reduction_factor, probabilities)
            sequence_lengths = Evaluator.increase_sequence_lengths_by_one(sequence_lengths)

            beam_results, beam_scores, timesteps, out_seq_len = \
                decoder.decode(probabilities.data, sequence_lengths)

        total += labels.size(0)

        for example_index in range(0, beam_results.size(0)):
            beam_results_sequence = beam_results[example_index][0]
            use_language_model_in_decoder = language_model_parameters is not None
            output_string = Evaluator.convert_to_string(
                beam_results_sequence, vocab_list,
                out_seq_len[example_index][0],
                use_language_model_in_decoder)
            example_labels_with_padding = labels[example_index]
            # Extract the real example labels, removing the padding labels
            reference_labels = example_labels_with_padding[0:label_sizes[example_index]]

            reference_labels_string = Evaluator.convert_labels_tensor_to_string(
                reference_labels, vocab_list, blank_symbol)

            if reference_labels_string == output_string:
                correct += 1
                correct_string = "correct"
            else:
                correct_string = "wrong"

            print(">>> evaluate_mdrnn - output: \"" + output_string + "\" " +
                  "\nreference: \"" + reference_labels_string + "\" --- " +
                  correct_string)

            output_strings.append(output_string)
            reference_labels_strings.append(reference_labels_string)

    cer_including_word_separators = evaluation_metrics.character_error_rate. \
        compute_character_error_rate_for_list_of_output_reference_pairs_fast(
            output_strings, reference_labels_strings, True)

    cer_excluding_word_separators = evaluation_metrics.character_error_rate. \
        compute_character_error_rate_for_list_of_output_reference_pairs_fast(
            output_strings, reference_labels_strings, False)

    wer = evaluation_metrics.word_error_rate. \
        compute_word_error_rate_for_list_of_output_reference_pairs(
            output_strings, reference_labels_strings)

    total_examples = len(test_loader.dataset)
    validation_stats = ValidationStats(total_examples, correct,
                                       cer_excluding_word_separators, wer)
    # https://stackoverflow.com/questions/3395138/using-multiple-arguments-for-string-formatting-in-python-e-g-s-s
    print("Accuracy of the network on the {} test inputs: {:.2f} %".format(
        total_examples, validation_stats.get_accuracy()))
    print("Character Error Rate (CER)[%] of the network on the {} test inputs, "
          "including word separators: {:.3f}".format(
              total_examples, cer_including_word_separators))
    print("Character Error Rate (CER)[%] of the network on the {} test inputs, "
          "excluding word separators: {:.3f}".format(
              total_examples, cer_excluding_word_separators))
    print("Word Error Rate (WER)[%] of the network on the {} test inputs: "
          "{:.3f}".format(total_examples, wer))

    if save_score_table_file_path is not None:
        score_file_existed = os.path.exists(save_score_table_file_path)
        # Open the file in append mode; create it if it doesn't exist
        with open(save_score_table_file_path, "a") as scores_table_file:
            if not score_file_existed:
                scores_table_file.write(
                    Evaluator.score_table_header(total_examples, epoch_statistics))
            scores_table_file.write(
                Evaluator.score_table_line(epoch_number, correct,
                                           validation_stats.get_accuracy(),
                                           cer_including_word_separators,
                                           cer_excluding_word_separators,
                                           wer, epoch_statistics) + "\n")

    return validation_stats
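

# A minimal sketch of the softmax + beam-search decoding step used in
# evaluate_mdrnn above, calling the third-party ctcdecode package directly
# instead of Evaluator.create_decoder; the vocabulary, beam width and tensor
# sizes are illustrative assumptions.
def _example_beam_decode():
    import torch
    from ctcdecode import CTCBeamDecoder

    vocab_list = ["_", "a", "b", "c"]  # "_" is the CTC blank symbol
    # Fake network output: batch size 1, 5 time steps, len(vocab_list) classes
    outputs = torch.randn(1, 5, len(vocab_list))
    probabilities = torch.nn.functional.softmax(outputs, dim=2)
    decoder = CTCBeamDecoder(vocab_list, beam_width=20, blank_id=0)
    sequence_lengths = torch.IntTensor([5])
    beam_results, beam_scores, timesteps, out_seq_len = \
        decoder.decode(probabilities, sequence_lengths)
    # The best hypothesis for example 0 is the first out_seq_len[0][0] symbols
    best = beam_results[0][0][:int(out_seq_len[0][0])]
    print("".join(vocab_list[int(i)] for i in best))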
    def train_one_epoch(self, train_loader, epoch: int, start: int, batch_size,
                        device, inputs_is_list: bool, report_func=None):
        """Train the next epoch.

        Args:
            train_loader: the training data loader
            epoch (int): the epoch number
            start (int): time in seconds at which training started
            batch_size: the number of examples in a full minibatch
            device: the device to move the inputs to
            inputs_is_list (bool): whether the inputs come as a list of tensors
            report_func (fn): function for logging

        Returns:
            Average loss per minibatch, total number of examples
        """
        num_gradient_corrections = 0
        gradient_norms_sum = 0
        running_loss = 0.0
        total_summed_loss_epoch = 0.0
        total_examples = 0
        number_of_minibatches = 0
        time_start = time.time()
        for i, data in enumerate(train_loader, 0):
            time_start_batch = time.time()

            # get the inputs
            inputs, labels = data

            Trainer.check_there_are_no_zero_labels(labels, inputs_is_list)

            # If minimize_horizontal_padding is used, inputs will be a list
            if Utils.use_cuda():
                if not inputs_is_list:
                    inputs = inputs.to(device)
                else:
                    inputs = Utils.move_tensor_list_to_device(inputs, device)

            # If the image input comes in the form of unsigned ints, it needs
            # to be converted to floats (after moving to GPU, i.e. directly on
            # GPU, which is faster)
            if self.model_properties.image_input_is_unsigned_int:
                Trainer.check_inputs_is_right_type(inputs, inputs_is_list)
                inputs = IamLinesDataset.convert_unsigned_int_image_tensor_or_list_to_float_image_tensor_or_list(
                    inputs)

            # Gradients are required only for the inputs; the labels need no
            # gradient, and they must remain on the CPU for the warp-ctc loss
            if inputs_is_list:
                for element in inputs:
                    element.requires_grad_(True)
            else:
                inputs.requires_grad_(True)

            # forward + backward + optimize
            time_start_network_forward = util.timing.date_time_now()
            max_input_width = NetworkToSoftMaxNetwork.get_max_input_width(inputs)
            outputs = self.model(inputs, max_input_width)
            # print("Time used for network forward: " +
            #       str(util.timing.milliseconds_since(time_start_network_forward)))

            if inputs_is_list:
                number_of_examples = len(inputs)
            else:
                number_of_examples = inputs.size(0)

            time_start_ctc_loss_computation = util.timing.date_time_now()
            loss = self.warp_ctc_loss_interface.compute_ctc_loss(
                outputs, labels, number_of_examples,
                self.model_properties.width_reduction_factor)
            total_examples += number_of_examples
            # print("Time used for ctc loss computation: " +
            #       str(util.timing.milliseconds_since(time_start_ctc_loss_computation)))

            # See: https://github.com/SeanNaren/deepspeech.pytorch/blob/master/train.py
            # The averaging seems to help learning (but a smaller learning rate
            # might have the same effect!)
            loss = loss / number_of_examples  # average the loss by minibatch size

            loss_sum = loss.data.sum()
            inf = float("inf")
            if loss_sum == inf or loss_sum == -inf:
                print("WARNING: received an inf loss, setting loss value to 0")
                loss_value = 0
            else:
                loss_value = loss.item()

            time_start_loss_backward = util.timing.date_time_now()

            # zero the parameter gradients
            self.optimizer.zero_grad()
            self.model.zero_grad()

            loss = loss.contiguous()
            loss.backward()
            # print("Time used for loss backward: " +
            #       str(util.timing.milliseconds_since(time_start_loss_backward)))

            # Perform an update step, including norm-based gradient clipping.
            # Compensate the maximum gradient norm by the factor
            # number_of_examples / batch_size. This is to avoid over-correction
            # (too much learning) for the last batch, which contains fewer
            # examples.
            made_gradient_norm_based_correction, total_norm = \
                self.optimizer.step_with_scaling_for_size_current_batch(
                    number_of_examples, batch_size)
            print("trainer - total norm: " + str(total_norm))
            if made_gradient_norm_based_correction:
                num_gradient_corrections += 1
            gradient_norms_sum += total_norm

            # print statistics
            running_loss += loss_value
            total_summed_loss_epoch += loss_value
            if i % 10 == 9:  # print every 10 mini-batches
                end = time.time()
                running_time = end - start
                print('[%d, %5d] loss: %.3f' % (epoch, i + 1, running_loss / 10) +
                      " Running time: " + str(running_time))
                average_norm = gradient_norms_sum / 10
                print("Number of gradient norm-based corrections: " +
                      str(num_gradient_corrections))
                print("Average gradient total norm: " + str(average_norm))
                running_loss = 0.0
                num_gradient_corrections = 0
                gradient_norms_sum = 0

                percent = (i + 1) / float(len(train_loader))
                examples_processed = (i + 1) * batch_size
                total_examples = len(train_loader.dataset)
                print("Processed " + str(examples_processed) + " of " +
                      str(total_examples) + " examples in this epoch")
                print(">>> Time used in current epoch: " +
                      str(util.timing.time_since_and_expected_remaining_time(
                          time_start, percent)))
                sys.stdout.flush()

            number_of_minibatches += 1

        average_loss_per_minibatch = total_summed_loss_epoch / number_of_minibatches
        return average_loss_per_minibatch, total_examples
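

    # A sketch of the batch-size-compensated, norm-based gradient clipping that
    # step_with_scaling_for_size_current_batch is described as performing. The
    # optimizer wrapper is project-specific, so this re-creates the idea with
    # the standard torch.nn.utils.clip_grad_norm_; max_norm=400 is an assumed
    # default, not a value taken from this code base.
    @staticmethod
    def _example_scaled_gradient_clipping(model, optimizer, number_of_examples,
                                          batch_size, max_norm=400):
        import torch

        # A smaller final batch gets a proportionally smaller norm ceiling, so
        # the last (partial) batch of an epoch cannot cause an oversized update
        scaled_max_norm = max_norm * (number_of_examples / batch_size)
        total_norm = float(torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                          scaled_max_norm))
        made_gradient_norm_based_correction = total_norm > scaled_max_norm
        optimizer.step()
        return made_gradient_norm_based_correction, total_norm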