def create_and_check_reformer_random_seed(self, config, input_ids, input_mask, choice_labels):
    layer = ReformerLayer(config).to(torch_device)
    layer.train()
    shape = (
        self.batch_size,
        self.seq_length,
        config.hidden_size,
    )  # Batch x SeqLen x hiddenSize
    hidden_states = floats_tensor(shape)
    attn_output = floats_tensor(shape)

    # the attention seed should change on (nearly) every forward pass in train mode
    seeds = []
    for _ in range(100):
        layer_outputs = layer(attn_output, hidden_states, attention_mask=input_mask)
        attn_output = layer_outputs.attn_output
        hidden_states = layer_outputs.hidden_states
        torch.manual_seed(layer.attention_seed)
        seeds.append(layer.attention_seed)
    self.parent.assertGreater(len(set(seeds)), 70)

    # same check for the feed-forward seed
    seeds = []
    for _ in range(100):
        layer_outputs = layer(attn_output, hidden_states, attention_mask=input_mask)
        attn_output = layer_outputs.attn_output
        hidden_states = layer_outputs.hidden_states
        torch.manual_seed(layer.feed_forward_seed)
        seeds.append(layer.feed_forward_seed)
    self.parent.assertGreater(len(set(seeds)), 70)
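# Illustrative sketch (not part of the test file above): the seed checks rely on the
# fact that re-seeding PyTorch's global RNG reproduces a dropout mask exactly, which
# is what lets a reversible layer redraw its dropout during recomputation. The helper
# name below is made up for the example; plain torch only.
def _example_dropout_reproducible_after_reseeding():
    import torch

    dropout = torch.nn.Dropout(p=0.5)
    dropout.train()
    x = torch.ones(4, 4)

    seed = torch.seed()  # seed the global RNG from a fresh value and remember it
    first = dropout(x)

    torch.manual_seed(seed)  # restore the RNG state -> identical dropout mask
    second = dropout(x)

    assert torch.equal(first, second)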
def test_local_layer_forward_complex(self):
    config = self._get_basic_config_and_input()
    config["attn_layers"] = ["local"]
    attn_mask = self._get_attn_mask()
    hidden_states = self._get_hidden_states()
    torch.manual_seed(0)
    layer = ReformerLayer(ReformerConfig(**config)).to(torch_device)
    layer.eval()

    reformer_output = layer(
        prev_attn_output=hidden_states,
        hidden_states=hidden_states,
        attention_mask=attn_mask,
    )
    output_slice = reformer_output.hidden_states[0, 0, :5]
    expected_output_slice = torch.tensor(
        [1.5476, -1.9020, -0.9902, 1.5013, -0.1950],
        dtype=torch.float,
        device=torch_device,
    )
    self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))
def test_local_layer_forward(self):
    config = self._get_basic_config_and_input()
    config["attn_layers"] = ["local"]
    config["is_decoder"] = False
    hidden_states = self._get_hidden_states()
    torch.manual_seed(0)
    layer = ReformerLayer(ReformerConfig(**config)).to(torch_device)
    layer.eval()

    reformer_output = layer(prev_attn_output=hidden_states, hidden_states=hidden_states)
    output_slice = reformer_output.hidden_states[0, 0, :5]
    expected_output_slice = torch.tensor(
        [1.4212, -2.0576, -0.9688, 1.4599, -0.1344],
        dtype=torch.float,
        device=torch_device,
    )
    self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))
def test_lsh_layer_forward(self):
    config = self._get_basic_config_and_input()
    config["attn_layers"] = ["lsh"]
    config["is_decoder"] = False
    hidden_states = self._get_hidden_states()
    torch.manual_seed(0)
    layer = ReformerLayer(ReformerConfig(**config)).to(torch_device)
    layer.eval()

    reformer_output = layer(prev_attn_output=hidden_states.clone(), hidden_states=hidden_states)
    output_slice = reformer_output.hidden_states[0, 0, :5]
    expected_output_slice = torch.tensor(
        [1.6879, -1.3083, -0.4708, 1.3555, -0.6292],
        dtype=torch.float,
        device=torch_device,
    )
    self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))
def test_lsh_layer_forward_complex(self):
    config = self._get_basic_config_and_input()
    config["attn_layers"] = ["lsh"]
    config["num_buckets"] = [2, 4]
    attn_mask = self._get_attn_mask()
    hidden_states = self._get_hidden_states()
    torch.manual_seed(0)
    layer = ReformerLayer(ReformerConfig(**config)).to(torch_device)
    layer.eval()

    reformer_output = layer(
        prev_attn_output=hidden_states.clone(),
        hidden_states=hidden_states,
        attention_mask=attn_mask,
    )
    output_slice = reformer_output.hidden_states[0, 0, :5]
    expected_output_slice = torch.tensor(
        [1.6439, -1.2306, -0.5108, 1.3006, -0.6537],
        dtype=torch.float,
        device=torch_device,
    )
    self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))
def create_and_check_reformer_layer_dropout_seed(self, config, input_ids, input_mask, is_decoder):
    config.is_decoder = is_decoder
    layer = ReformerLayer(config).to(torch_device)
    layer.train()
    shape = (
        self.batch_size,
        self.seq_length,
        config.hidden_size,
    )  # Batch x SeqLen x hiddenSize

    # get random tensors
    hidden_states = floats_tensor(shape)
    prev_attn_output = floats_tensor(shape)

    # now the random seeds for attention and feed forward are initialized
    # forward tensors with dropout
    layer_outputs = layer(prev_attn_output, hidden_states, attention_mask=input_mask)
    next_attn_output = layer_outputs.attn_output
    next_hidden_states = layer_outputs.hidden_states

    # re-seeding with the stored attention seed must reproduce the attention output
    torch.manual_seed(layer.attention_seed)
    attn_outputs = layer.attention(hidden_states, attention_mask=input_mask)
    self.parent.assertTrue(
        torch.allclose(prev_attn_output + attn_outputs.hidden_states, next_attn_output, atol=1e-3)
    )

    # re-seeding with the stored feed-forward seed must reproduce the feed-forward output
    torch.manual_seed(layer.feed_forward_seed)
    feed_forward_hidden_states = layer.feed_forward(next_attn_output)
    self.parent.assertTrue(
        torch.allclose(next_hidden_states, hidden_states + feed_forward_hidden_states, atol=1e-3)
    )
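# Illustrative sketch of the reversible residual scheme that the dropout-seed test
# above verifies in simplified form; `f` and `g` stand in for the layer's attention
# and feed-forward blocks, and the helper name is made up for the example. Outputs are
#     y1 = x1 + f(x2)   and   y2 = x2 + g(y1),
# so the inputs can be recomputed from the outputs instead of being stored:
#     x2 = y2 - g(y1)   and   x1 = y1 - f(x2).
# With dropout, the subtraction only cancels if the same dropout mask is redrawn,
# which is exactly what re-seeding with the stored attention/feed-forward seeds buys.
def _example_reversible_residual_roundtrip():
    import torch

    torch.manual_seed(0)
    f = torch.nn.Linear(8, 8)  # stand-in for the attention block
    g = torch.nn.Linear(8, 8)  # stand-in for the feed-forward block

    x1, x2 = torch.randn(2, 8), torch.randn(2, 8)

    # forward pass of one reversible layer
    y1 = x1 + f(x2)
    y2 = x2 + g(y1)

    # recompute the inputs from the outputs (no activations stored)
    x2_rec = y2 - g(y1)
    x1_rec = y1 - f(x2_rec)

    assert torch.allclose(x1, x1_rec, atol=1e-6)
    assert torch.allclose(x2, x2_rec, atol=1e-6)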
def _compute_pytorch(
    model_names,
    batch_sizes,
    slice_sizes,
    dictionary,
    average_over,
    device,
    torchscript,
    fp16,
    no_speed,
    no_memory,
    verbose,
    num_hashes,
):
    hidden_size = 64
    num_attention_heads = 2
    intermediate_size = 128
    chunk_length = 64

    hidden_states = floats_tensor((1, 2 ** 16, hidden_size))

    for c, model_name in enumerate(model_names):
        print(f"{c + 1} / {len(model_names)}")
        dictionary[model_name] = {
            "bs": batch_sizes,
            "ss": slice_sizes,
            "results": {},
            "memory": {},
        }
        dictionary[model_name]["results"] = {i: {} for i in batch_sizes}
        dictionary[model_name]["memory"] = {i: {} for i in batch_sizes}

        for batch_size in batch_sizes:
            for slice_size in slice_sizes:
                # scale the number of LSH buckets with the sequence length and
                # factorize it once it exceeds the chunk length
                num_buckets = int(2 * slice_size / chunk_length)
                if num_buckets > chunk_length:
                    factorized_num_buckets = num_buckets // 32
                    num_buckets = [32, factorized_num_buckets]

                bert_config = BertConfig(
                    hidden_size=hidden_size,
                    num_attention_heads=num_attention_heads,
                    intermediate_size=intermediate_size,
                    hidden_dropout_prob=0.0,
                    attention_probs_dropout_prob=0.0,
                )
                reformer_config = ReformerConfig(
                    hidden_size=hidden_size,
                    num_attention_heads=num_attention_heads,
                    intermediate_size=intermediate_size,
                    chunk_length=chunk_length,
                    num_hashes=num_hashes,
                    num_buckets=num_buckets,
                )
                layers = {
                    "ReformerLayer": ReformerLayer(reformer_config),
                    "BertLayer": BertLayer(bert_config),
                }
                model = layers[model_name]

                if fp16:
                    model.half()
                model.to(device)
                model.eval()

                sequence = hidden_states[0, :slice_size, :].to(device=device).repeat(batch_size, 1, 1)

                try:
                    if torchscript:
                        print("Tracing model with sequence size", sequence.shape)
                        inference = torch.jit.trace(model, sequence)
                        inference(sequence)
                    else:
                        inference = model
                        if model_name == "ReformerLayer":
                            inference(sequence, sequence)
                        else:
                            inference(sequence)

                    if not no_memory:
                        # model.add_memory_hooks()  # Forward method tracing (only for PyTorch models)
                        trace = start_memory_tracing("transformers")
                        if model_name == "ReformerLayer":
                            inference(sequence, sequence)
                        else:
                            inference(sequence)
                        summary = stop_memory_tracing(trace)

                        if verbose:
                            print_summary_statistics(summary)
                        dictionary[model_name]["memory"][batch_size][slice_size] = str(summary.total)
                    else:
                        dictionary[model_name]["memory"][batch_size][slice_size] = "N/A"

                    if not no_speed:
                        print("Going through model with sequence of shape", sequence.shape)
                        if model_name == "ReformerLayer":
                            runtimes = timeit.repeat(
                                lambda: inference(sequence, sequence), repeat=average_over, number=3
                            )
                        else:
                            runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3)

                        # timeit runs the statement `number=3` times per repeat, so divide by 3 for per-call time
                        average_time = sum(runtimes) / float(len(runtimes)) / 3.0
                        dictionary[model_name]["results"][batch_size][slice_size] = average_time
                    else:
                        dictionary[model_name]["results"][batch_size][slice_size] = "N/A"

                except RuntimeError as e:
                    print("Doesn't fit on GPU.", e)
                    torch.cuda.empty_cache()
                    dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
                    dictionary[model_name]["memory"][batch_size][slice_size] = "N/A"

    return dictionary
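# Illustrative call (the helper name and argument values are made up for the example;
# only the signature comes from `_compute_pytorch` above). Benchmarks a ReformerLayer
# against a BertLayer over a few sequence lengths on whatever device is available:
def _example_run_benchmark():
    import torch

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    results = _compute_pytorch(
        model_names=["ReformerLayer", "BertLayer"],
        batch_sizes=[1],
        slice_sizes=[2 ** 9, 2 ** 12, 2 ** 14],
        dictionary={},
        average_over=3,
        device=device,
        torchscript=False,
        fp16=False,
        no_speed=False,
        no_memory=False,
        verbose=False,
        num_hashes=1,
    )
    print(results)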