Example 1
    def create_and_check_reformer_random_seed(self, config, input_ids, input_mask, choice_labels):
        layer = ReformerLayer(config).to(torch_device)
        layer.train()

        shape = (
            self.batch_size,
            self.seq_length,
            config.hidden_size,
        )  # Batch x SeqLen x hiddenSize

        hidden_states = floats_tensor(shape)
        attn_output = floats_tensor(shape)

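        # in train mode the layer draws a fresh attention dropout seed on each
        # forward pass; expect mostly distinct seeds across 100 passes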
        seeds = []
        for _ in range(100):
            layer_outputs = layer(attn_output,
                                  hidden_states,
                                  attention_mask=input_mask)
            attn_output = layer_outputs.attn_output
            hidden_states = layer_outputs.hidden_states
            torch.manual_seed(layer.attention_seed)
            seeds.append(layer.attention_seed)
        self.parent.assertGreater(len(set(seeds)), 70)

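        # repeat the check for the feed forward dropout seed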
        seeds = []
        for _ in range(100):
            layer_outputs = layer(attn_output,
                                  hidden_states,
                                  attention_mask=input_mask)
            attn_output = layer_outputs.attn_output
            hidden_states = layer_outputs.hidden_states
            torch.manual_seed(layer.feed_forward_seed)
            seeds.append(layer.feed_forward_seed)
        self.parent.assertGreater(len(set(seeds)), 70)

    def test_local_layer_forward_complex(self):
        config = self._get_basic_config_and_input()
        config["attn_layers"] = ["local"]
        attn_mask = self._get_attn_mask()
        hidden_states = self._get_hidden_states()
        torch.manual_seed(0)
        layer = ReformerLayer(ReformerConfig(**config)).to(torch_device)
        layer.eval()
        reformer_output = layer(
            prev_attn_output=hidden_states, hidden_states=hidden_states, attention_mask=attn_mask,
        )
        output_slice = reformer_output.hidden_states[0, 0, :5]
        expected_output_slice = torch.tensor(
            [1.5476, -1.9020, -0.9902, 1.5013, -0.1950], dtype=torch.float, device=torch_device,
        )
        self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))

    def test_local_layer_forward(self):
        config = self._get_basic_config_and_input()
        config["attn_layers"] = ["local"]
        config["is_decoder"] = False
        hidden_states = self._get_hidden_states()
        torch.manual_seed(0)
        layer = ReformerLayer(ReformerConfig(**config)).to(torch_device)
        layer.eval()
        reformer_output = layer(prev_attn_output=hidden_states, hidden_states=hidden_states)
        output_slice = reformer_output.hidden_states[0, 0, :5]
        expected_output_slice = torch.tensor(
            [1.4212, -2.0576, -0.9688, 1.4599, -0.1344], dtype=torch.float, device=torch_device,
        )
        self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))

    def test_lsh_layer_forward(self):
        config = self._get_basic_config_and_input()
        config["attn_layers"] = ["lsh"]
        config["is_decoder"] = False
        hidden_states = self._get_hidden_states()
        torch.manual_seed(0)
        layer = ReformerLayer(ReformerConfig(**config)).to(torch_device)
        layer.eval()
        reformer_output = layer(prev_attn_output=hidden_states.clone(), hidden_states=hidden_states)
        output_slice = reformer_output.hidden_states[0, 0, :5]
        expected_output_slice = torch.tensor(
            [1.6879, -1.3083, -0.4708, 1.3555, -0.6292], dtype=torch.float, device=torch_device,
        )
        self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))

    def test_lsh_layer_forward_complex(self):
        config = self._get_basic_config_and_input()
        config["attn_layers"] = ["lsh"]
        config["num_buckets"] = [2, 4]
        attn_mask = self._get_attn_mask()
        hidden_states = self._get_hidden_states()
        torch.manual_seed(0)
        layer = ReformerLayer(ReformerConfig(**config)).to(torch_device)
        layer.eval()
        reformer_output = layer(
            prev_attn_output=hidden_states.clone(), hidden_states=hidden_states, attention_mask=attn_mask,
        )
        output_slice = reformer_output.hidden_states[0, 0, :5]
        expected_output_slice = torch.tensor(
            [1.6439, -1.2306, -0.5108, 1.3006, -0.6537], dtype=torch.float, device=torch_device,
        )
        self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))

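    # checks that the layer's stored dropout seeds make its outputs reproducible;
    # this determinism is what lets Reformer's reversible layers recompute
    # activations in the backward pass instead of storing them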
    def create_and_check_reformer_layer_dropout_seed(self, config, input_ids, input_mask, is_decoder):
        config.is_decoder = is_decoder
        layer = ReformerLayer(config).to(torch_device)
        layer.train()
        shape = (
            self.batch_size,
            self.seq_length,
            config.hidden_size,
        )  # Batch x SeqLen x hiddenSize

        # get random tensors
        hidden_states = floats_tensor(shape)
        prev_attn_output = floats_tensor(shape)

        # the random seeds for attention and feed forward are initialized here
        # forward the tensors with dropout active
        layer_outputs = layer(prev_attn_output, hidden_states, attention_mask=input_mask)

        next_attn_output = layer_outputs.attn_output
        next_hidden_states = layer_outputs.hidden_states

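        # re-seeding with the seed stored during the forward pass must reproduce
        # the same dropout mask, so the attention output can be recomputed exactly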
        torch.manual_seed(layer.attention_seed)
        attn_outputs = layer.attention(hidden_states, attention_mask=input_mask)
        self.parent.assertTrue(
            torch.allclose(prev_attn_output + attn_outputs.hidden_states, next_attn_output, atol=1e-3)
        )

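        # the same determinism check for the feed forward seed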
        torch.manual_seed(layer.feed_forward_seed)
        feed_forward_hidden_states = layer.feed_forward(next_attn_output)
        self.parent.assertTrue(
            torch.allclose(next_hidden_states, hidden_states + feed_forward_hidden_states, atol=1e-3)
        )
def _compute_pytorch(
    model_names,
    batch_sizes,
    slice_sizes,
    dictionary,
    average_over,
    device,
    torchscript,
    fp16,
    no_speed,
    no_memory,
    verbose,
    num_hashes
):

    hidden_size = 64
    num_attention_heads = 2
    intermediate_size = 128

    chunk_length = 64

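    # one long random sequence (2**16 positions); the benchmark slices shorter
    # inputs out of it below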
    hidden_states = floats_tensor((1, 2 ** 16, hidden_size))

    for c, model_name in enumerate(model_names):
        print(f"{c + 1} / {len(model_names)}")

        dictionary[model_name] = {
            "bs": batch_sizes,
            "ss": slice_sizes,
            "results": {i: {} for i in batch_sizes},
            "memory": {i: {} for i in batch_sizes},
        }

        for batch_size in batch_sizes:

            for slice_size in slice_sizes:

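                # scale the LSH bucket count with the slice length; bucket counts
                # larger than the chunk length are factorized into two factors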
                num_buckets = int(2 * slice_size / chunk_length)
                if num_buckets > chunk_length:
                    factorized_num_buckets = num_buckets // 32
                    num_buckets = [32, factorized_num_buckets]

                bert_config = BertConfig(
                    hidden_size=hidden_size,
                    num_attention_heads=num_attention_heads,
                    intermediate_size=intermediate_size,
                    hidden_dropout_prob=0.0,
                    attention_probs_dropout_prob=0.0,
                )

                reformer_config = ReformerConfig(
                    hidden_size=hidden_size,
                    num_attention_heads=num_attention_heads,
                    intermediate_size=intermediate_size,
                    chunk_length=chunk_length,
                    num_hashes=num_hashes,
                    num_buckets=num_buckets
                )

                if model_name == "ReformerLayer":
                    model = ReformerLayer(reformer_config)
                else:
                    model = BertLayer(bert_config)

                if fp16:
                    model.half()
                model.to(device)
                model.eval()

                sequence = (
                    hidden_states[0, :slice_size, :]
                    .to(device=device)
                    .repeat(batch_size, 1, 1)
                )
                try:
                    # warm-up forward pass (and TorchScript trace, if requested) so
                    # one-time setup cost does not distort the measurements below
                    if torchscript:
                        print("Tracing model with sequence size", sequence.shape)
                        inference = torch.jit.trace(model, sequence)
                        inference(sequence)
                    else:
                        inference = model
                        if model_name == "ReformerLayer":
                            inference(sequence, sequence)
                        else:
                            inference(sequence)

                    if not no_memory:
                        # model.add_memory_hooks()  # Forward method tracing (only for PyTorch models)

                        trace = start_memory_tracing("transformers")
                        if model_name == "ReformerLayer":
                            inference(sequence, sequence)
                        else:
                            inference(sequence)
                        summary = stop_memory_tracing(trace)

                        if verbose:
                            print_summary_statistics(summary)

                        dictionary[model_name]["memory"][batch_size][slice_size] = str(summary.total)
                    else:
                        dictionary[model_name]["memory"][batch_size][slice_size] = "N/A"

                    if not no_speed:
                        print("Going through model with sequence of shape", sequence.shape)
                        if model_name == "ReformerLayer":
                            runtimes = timeit.repeat(
                                lambda: inference(sequence, sequence),
                                repeat=average_over,
                                number=3,
                            )
                        else:
                            runtimes = timeit.repeat(
                                lambda: inference(sequence),
                                repeat=average_over,
                                number=3,
                            )
                        # each repeat runs the forward pass number=3 times, so divide
                        # by 3 for the average time of a single forward pass
                        average_time = sum(runtimes) / float(len(runtimes)) / 3.0
                        dictionary[model_name]["results"][batch_size][slice_size] = average_time
                    else:
                        dictionary[model_name]["results"][batch_size][slice_size] = "N/A"

                except RuntimeError as e:
                    print("Doesn't fit on GPU.", e)
                    torch.cuda.empty_cache()
                    dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
                    dictionary[model_name]["memory"][batch_size][slice_size] = "N/A"
    return dictionary
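
A minimal driver for _compute_pytorch, as a sketch: the call below only shows how
the function's parameters fit together, and every argument value is an illustrative
assumption rather than something taken from the original script.

if __name__ == "__main__":
    import torch

    # hypothetical invocation: all values here are assumptions for illustration
    results = _compute_pytorch(
        model_names=["ReformerLayer", "BertLayer"],
        batch_sizes=[1, 8],
        slice_sizes=[512, 2048, 8192],  # multiples of chunk_length=64
        dictionary={},
        average_over=10,
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
        torchscript=False,
        fp16=False,
        no_speed=False,
        no_memory=False,
        verbose=False,
        num_hashes=2,  # number of LSH hashing rounds for the Reformer layer
    )
    print(results["ReformerLayer"]["results"])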