def _compute_pytorch(
    model_names, batch_sizes, slice_sizes, dictionary, average_over, device, torchscript, fp16, no_speed, no_memory, verbose
):
    """Benchmark PyTorch model inference speed and memory over all (batch_size, sequence_length) pairs.

    Relies on module-level names defined elsewhere in this script: `input_text` (a long string to
    tokenize), `print_summary_statistics`, and the `start_memory_tracing`/`stop_memory_tracing`
    utilities from transformers.
    """
    for c, model_name in enumerate(model_names):
        print(f"{c + 1} / {len(model_names)}")
        config = AutoConfig.from_pretrained(model_name, torchscript=torchscript)
        model = AutoModel.from_pretrained(model_name, config=config)
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        tokenized_sequence = tokenizer.encode(input_text, add_special_tokens=False)
        max_input_size = tokenizer.max_model_input_sizes[model_name]

        dictionary[model_name] = {"bs": batch_sizes, "ss": slice_sizes, "results": {}, "memory": {}}
        dictionary[model_name]["results"] = {i: {} for i in batch_sizes}
        dictionary[model_name]["memory"] = {i: {} for i in batch_sizes}

        for batch_size in batch_sizes:
            if fp16:
                model.half()
            model.to(device)
            model.eval()

            for slice_size in slice_sizes:
                if max_input_size is not None and slice_size > max_input_size:
                    dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
                else:
                    sequence = torch.tensor(tokenized_sequence[:slice_size], device=device).repeat(batch_size, 1)
                    try:
                        if torchscript:
                            print("Tracing model with sequence size", sequence.shape)
                            inference = torch.jit.trace(model, sequence)
                            inference(sequence)
                        else:
                            inference = model
                            inference(sequence)

                        if not no_memory:
                            # model.add_memory_hooks()  # Forward method tracing (only for PyTorch models)

                            # Line by line memory tracing (all code in the module `transformers`) works for all models/arbitrary code
                            trace = start_memory_tracing("transformers")
                            inference(sequence)
                            summary = stop_memory_tracing(trace)

                            if verbose:
                                print_summary_statistics(summary)

                            dictionary[model_name]["memory"][batch_size][slice_size] = str(summary.total)
                        else:
                            dictionary[model_name]["memory"][batch_size][slice_size] = "N/A"

                        if not no_speed:
                            print("Going through model with sequence of shape", sequence.shape)
                            runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3)
                            average_time = sum(runtimes) / float(len(runtimes)) / 3.0
                            dictionary[model_name]["results"][batch_size][slice_size] = average_time
                        else:
                            dictionary[model_name]["results"][batch_size][slice_size] = "N/A"

                    except RuntimeError as e:
                        print("Doesn't fit on GPU.", e)
                        torch.cuda.empty_cache()
                        dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
                        dictionary[model_name]["memory"][batch_size][slice_size] = "N/A"
    return dictionary
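# Illustrative driver, not part of the original script: a minimal sketch of how
# `_compute_pytorch` above might be invoked. The checkpoint name, batch sizes and slice
# sizes below are made-up example values; the function also relies on the module-level
# `input_text` and `print_summary_statistics` being defined elsewhere in this script.
def _example_pytorch_benchmark():  # hypothetical helper
    device = "cuda" if torch.cuda.is_available() else "cpu"
    results = _compute_pytorch(
        model_names=["bert-base-uncased"],  # example checkpoint
        batch_sizes=[1, 2],                 # example batch sizes
        slice_sizes=[64, 128],              # example sequence lengths
        dictionary={},
        average_over=3,
        device=device,
        torchscript=False,
        fp16=False,
        no_speed=False,
        no_memory=False,
        verbose=False,
    )
    return results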
def _compute_tensorflow(model_names, batch_sizes, slice_sizes, dictionary, average_over, amp, no_speed, no_memory, verbose):
    """TensorFlow counterpart of `_compute_pytorch`: benchmarks inference speed and memory over all
    (batch_size, sequence_length) pairs. Note that the `amp` argument is accepted but not used below.
    """
    for c, model_name in enumerate(model_names):
        print(f"{c + 1} / {len(model_names)}")
        config = AutoConfig.from_pretrained(model_name)
        model = TFAutoModel.from_pretrained(model_name, config=config)
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        tokenized_sequence = tokenizer.encode(input_text, add_special_tokens=False)
        max_input_size = tokenizer.max_model_input_sizes[model_name]

        dictionary[model_name] = {"bs": batch_sizes, "ss": slice_sizes, "results": {}, "memory": {}}
        dictionary[model_name]["results"] = {i: {} for i in batch_sizes}
        dictionary[model_name]["memory"] = {i: {} for i in batch_sizes}

        print("Using model", model)

        @tf.function
        def inference(inputs):
            return model(inputs)

        for batch_size in batch_sizes:
            for slice_size in slice_sizes:
                if max_input_size is not None and slice_size > max_input_size:
                    dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
                else:
                    sequence = tf.stack(
                        [tf.squeeze(tf.constant(tokenized_sequence[:slice_size])[None, :])] * batch_size
                    )

                    try:
                        print("Going through model with sequence of shape", sequence.shape)
                        # To make sure that the model is traced + that the tensors are on the appropriate device
                        inference(sequence)

                        if not no_memory:
                            # Line by line memory tracing (all code in the module `transformers`) works for all models/arbitrary code
                            trace = start_memory_tracing("transformers")
                            inference(sequence)
                            summary = stop_memory_tracing(trace)

                            if verbose:
                                print_summary_statistics(summary)

                            dictionary[model_name]["memory"][batch_size][slice_size] = str(summary.total)
                        else:
                            dictionary[model_name]["memory"][batch_size][slice_size] = "N/A"

                        if not no_speed:
                            runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3)
                            average_time = sum(runtimes) / float(len(runtimes)) / 3.0
                            dictionary[model_name]["results"][batch_size][slice_size] = average_time
                        else:
                            dictionary[model_name]["results"][batch_size][slice_size] = "N/A"

                    except tf.errors.ResourceExhaustedError as e:
                        print("Doesn't fit on GPU.", e)
                        dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
                        dictionary[model_name]["memory"][batch_size][slice_size] = "N/A"
    return dictionary
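# Illustrative driver, not part of the original script: the TensorFlow counterpart of the
# sketch above. Argument values are made-up examples; `amp` is passed through even though
# `_compute_tensorflow` does not currently use it.
def _example_tensorflow_benchmark():  # hypothetical helper
    results = _compute_tensorflow(
        model_names=["bert-base-uncased"],  # example checkpoint
        batch_sizes=[1, 2],                 # example batch sizes
        slice_sizes=[64, 128],              # example sequence lengths
        dictionary={},
        average_over=3,
        amp=False,
        no_speed=False,
        no_memory=False,
        verbose=False,
    )
    return results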
def _compute_pytorch(
    model_names, batch_sizes, slice_sizes, dictionary, average_over, device, torchscript, fp16, no_speed, no_memory, verbose, num_hashes
):
    """Benchmark a single `ReformerLayer` against a `BertLayer` on random hidden states, over all
    (batch_size, sequence_length) pairs. This variant shares its name with the model-level helper
    above; it expects `floats_tensor`, `BertConfig`/`BertLayer`, and `ReformerConfig`/`ReformerLayer`
    to be available at module level.
    """
    hidden_size = 64
    num_attention_heads = 2
    intermediate_size = 128
    chunk_length = 64

    hidden_states = floats_tensor((1, 2 ** 16, hidden_size))

    for c, model_name in enumerate(model_names):
        print(f"{c + 1} / {len(model_names)}")
        dictionary[model_name] = {"bs": batch_sizes, "ss": slice_sizes, "results": {}, "memory": {}}
        dictionary[model_name]["results"] = {i: {} for i in batch_sizes}
        dictionary[model_name]["memory"] = {i: {} for i in batch_sizes}

        for batch_size in batch_sizes:
            for slice_size in slice_sizes:
                # Factorize the bucket count once it exceeds the chunk length
                num_buckets = int(2 * slice_size / chunk_length)
                if num_buckets > chunk_length:
                    factorized_num_buckets = num_buckets // 32
                    num_buckets = [32, factorized_num_buckets]

                bert_config = BertConfig(
                    hidden_size=hidden_size,
                    num_attention_heads=num_attention_heads,
                    intermediate_size=intermediate_size,
                    hidden_dropout_prob=0.0,
                    attention_probs_dropout_prob=0.0,
                )
                reformer_config = ReformerConfig(
                    hidden_size=hidden_size,
                    num_attention_heads=num_attention_heads,
                    intermediate_size=intermediate_size,
                    chunk_length=chunk_length,
                    num_hashes=num_hashes,
                    num_buckets=num_buckets,
                )
                layers = {"ReformerLayer": ReformerLayer(reformer_config), "BertLayer": BertLayer(bert_config)}
                model = layers[model_name]

                if fp16:
                    model.half()
                model.to(device)
                model.eval()

                sequence = hidden_states[0, :slice_size, :].to(device=device).repeat(batch_size, 1, 1)
                try:
                    if torchscript:
                        print("Tracing model with sequence size", sequence.shape)
                        inference = torch.jit.trace(model, sequence)
                        inference(sequence)
                    else:
                        inference = model
                        if model_name == "ReformerLayer":
                            # ReformerLayer's forward takes two tensors
                            inference(sequence, sequence)
                        else:
                            inference(sequence)

                    if not no_memory:
                        # model.add_memory_hooks()  # Forward method tracing (only for PyTorch models)
                        trace = start_memory_tracing("transformers")
                        if model_name == "ReformerLayer":
                            inference(sequence, sequence)
                        else:
                            inference(sequence)
                        summary = stop_memory_tracing(trace)

                        if verbose:
                            print_summary_statistics(summary)

                        dictionary[model_name]["memory"][batch_size][slice_size] = str(summary.total)
                    else:
                        dictionary[model_name]["memory"][batch_size][slice_size] = "N/A"

                    if not no_speed:
                        print("Going through model with sequence of shape", sequence.shape)
                        if model_name == "ReformerLayer":
                            runtimes = timeit.repeat(lambda: inference(sequence, sequence), repeat=average_over, number=3)
                        else:
                            runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3)
                        average_time = sum(runtimes) / float(len(runtimes)) / 3.0
                        dictionary[model_name]["results"][batch_size][slice_size] = average_time
                    else:
                        dictionary[model_name]["results"][batch_size][slice_size] = "N/A"

                except RuntimeError as e:
                    print("Doesn't fit on GPU.", e)
                    torch.cuda.empty_cache()
                    dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
                    dictionary[model_name]["memory"][batch_size][slice_size] = "N/A"
    return dictionary
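# `floats_tensor` is used by the layer benchmark above but not defined in this excerpt. In
# transformers' test utilities it produces a random float tensor of the requested shape; this
# is a minimal stand-in under that assumption (uniform values in [0, 1), optionally scaled).
# It only needs to be defined before the benchmark function is called, so placing it here is fine.
def floats_tensor(shape, scale=1.0):
    """Return a float32 tensor of `shape` filled with uniform random values scaled by `scale`."""
    return torch.rand(shape, dtype=torch.float32) * scale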