Example #1
def _compute_pytorch(
    model_names,
    batch_sizes,
    slice_sizes,
    dictionary,
    average_over,
    device,
    torchscript,
    fp16,
    no_speed,
    no_memory,
    verbose,
):
    for c, model_name in enumerate(model_names):
        print(f"{c + 1} / {len(model_names)}")
        config = AutoConfig.from_pretrained(model_name,
                                            torchscript=torchscript)
        model = AutoModel.from_pretrained(model_name, config=config)
        tokenizer = AutoTokenizer.from_pretrained(model_name)

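        # NOTE: `input_text` is expected to be defined at module level (see the usage sketch after this example)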
        tokenized_sequence = tokenizer.encode(input_text,
                                              add_special_tokens=False)

        max_input_size = tokenizer.max_model_input_sizes[model_name]

        dictionary[model_name] = {
            "bs": batch_sizes,
            "ss": slice_sizes,
            "results": {},
            "memory": {}
        }
        dictionary[model_name]["results"] = {i: {} for i in batch_sizes}
        dictionary[model_name]["memory"] = {i: {} for i in batch_sizes}

        for batch_size in batch_sizes:
            if fp16:
                model.half()
            model.to(device)
            model.eval()

            for slice_size in slice_sizes:
                if max_input_size is not None and slice_size > max_input_size:
                    dictionary[model_name]["results"][batch_size][
                        slice_size] = "N/A"
                else:
                    sequence = torch.tensor(tokenized_sequence[:slice_size],
                                            device=device).repeat(
                                                batch_size, 1)
                    try:
                        if torchscript:
                            print("Tracing model with sequence size",
                                  sequence.shape)
                            inference = torch.jit.trace(model, sequence)
                            inference(sequence)
                        else:
                            inference = model
                            inference(sequence)

                        if not no_memory:
                            # model.add_memory_hooks()  # Forward method tracing (only for PyTorch models)

                            # Line by line memory tracing (all code in the module `transformers`) works for all models/arbitrary code
                            trace = start_memory_tracing("transformers")
                            inference(sequence)
                            summary = stop_memory_tracing(trace)

                            if verbose:
                                print_summary_statistics(summary)

                            dictionary[model_name]["memory"][batch_size][
                                slice_size] = str(summary.total)
                        else:
                            dictionary[model_name]["memory"][batch_size][
                                slice_size] = "N/A"

                        if not no_speed:
                            print("Going through model with sequence of shape",
                                  sequence.shape)
                            runtimes = timeit.repeat(
                                lambda: inference(sequence),
                                repeat=average_over,
                                number=3)
                            average_time = sum(runtimes) / float(
                                len(runtimes)) / 3.0
                            dictionary[model_name]["results"][batch_size][
                                slice_size] = average_time
                        else:
                            dictionary[model_name]["results"][batch_size][
                                slice_size] = "N/A"

                    except RuntimeError as e:
                        print("Doesn't fit on GPU.", e)
                        torch.cuda.empty_cache()
                        dictionary[model_name]["results"][batch_size][
                            slice_size] = "N/A"
                        dictionary[model_name]["memory"][batch_size][
                            slice_size] = "N/A"
    return dictionary
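A minimal driver sketch for the function above, assuming it lives in a script that also imports the transformers auto classes and defines `input_text` at module level. The model name, batch/slice sizes, and the `input_text` value are placeholders, and memory tracing is disabled so the transformers memory-tracing helpers are not required.

import timeit

import torch
from transformers import AutoConfig, AutoModel, AutoTokenizer

# Hypothetical module-level input consumed by `_compute_pytorch` above.
input_text = "Benchmarking sentence. " * 512

results = _compute_pytorch(
    model_names=["bert-base-uncased"],
    batch_sizes=[1, 2],
    slice_sizes=[64, 128],
    dictionary={},
    average_over=5,
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    torchscript=False,
    fp16=False,
    no_speed=False,
    no_memory=True,   # skip memory tracing in this sketch
    verbose=False,
)
print(results["bert-base-uncased"]["results"])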
Example #2
def _compute_tensorflow(model_names, batch_sizes, slice_sizes, dictionary,
                        average_over, amp, no_speed, no_memory, verbose):
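    # NOTE: `amp` is accepted here but not used within this excerpt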
    for c, model_name in enumerate(model_names):
        print(f"{c + 1} / {len(model_names)}")
        config = AutoConfig.from_pretrained(model_name)
        model = TFAutoModel.from_pretrained(model_name, config=config)
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        tokenized_sequence = tokenizer.encode(input_text,
                                              add_special_tokens=False)

        max_input_size = tokenizer.max_model_input_sizes[model_name]

        dictionary[model_name] = {
            "bs": batch_sizes,
            "ss": slice_sizes,
            "results": {},
            "memory": {}
        }
        dictionary[model_name]["results"] = {i: {} for i in batch_sizes}
        dictionary[model_name]["memory"] = {i: {} for i in batch_sizes}

        print("Using model", model)

        @tf.function
        def inference(inputs):
            return model(inputs)

        for batch_size in batch_sizes:
            for slice_size in slice_sizes:
                if max_input_size is not None and slice_size > max_input_size:
                    dictionary[model_name]["results"][batch_size][
                        slice_size] = "N/A"
                else:
                    sequence = tf.stack([
                        tf.squeeze(
                            tf.constant(
                                tokenized_sequence[:slice_size])[None, :])
                    ] * batch_size)

                    try:
                        print("Going through model with sequence of shape",
                              sequence.shape)
                        # To make sure that the model is traced + that the tensors are on the appropriate device
                        inference(sequence)

                        if not no_memory:
                            # Line by line memory tracing (all code in the module `transformers`) works for all models/arbitrary code
                            trace = start_memory_tracing("transformers")
                            inference(sequence)
                            summary = stop_memory_tracing(trace)

                            if verbose:
                                print_summary_statistics(summary)

                            dictionary[model_name]["memory"][batch_size][
                                slice_size] = str(summary.total)
                        else:
                            dictionary[model_name]["memory"][batch_size][
                                slice_size] = "N/A"

                        if not no_speed:
                            runtimes = timeit.repeat(
                                lambda: inference(sequence),
                                repeat=average_over,
                                number=3)
                            average_time = sum(runtimes) / float(
                                len(runtimes)) / 3.0
                            dictionary[model_name]["results"][batch_size][
                                slice_size] = average_time
                        else:
                            dictionary[model_name]["results"][batch_size][
                                slice_size] = "N/A"

                    except tf.errors.ResourceExhaustedError as e:
                        print("Doesn't fit on GPU.", e)
                        dictionary[model_name]["results"][batch_size][
                            slice_size] = "N/A"
                        dictionary[model_name]["memory"][batch_size][
                            slice_size] = "N/A"
    return dictionary
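The TensorFlow variant can be driven the same way; this sketch assumes it sits in the same script as the function above and, again, that `input_text` is defined at module level. Memory tracing is skipped so the transformers tracing helpers are not needed.

import timeit

import tensorflow as tf
from transformers import AutoConfig, AutoTokenizer, TFAutoModel

input_text = "Benchmarking sentence. " * 512  # hypothetical placeholder

results = _compute_tensorflow(
    model_names=["bert-base-uncased"],
    batch_sizes=[1],
    slice_sizes=[64, 128],
    dictionary={},
    average_over=5,
    amp=False,
    no_speed=False,
    no_memory=True,
    verbose=False,
)
print(results["bert-base-uncased"]["results"])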
Example #3
def _compute_pytorch(
    model_names,
    batch_sizes,
    slice_sizes,
    dictionary,
    average_over,
    device,
    torchscript,
    fp16,
    no_speed,
    no_memory,
    verbose,
    num_hashes
):

    hidden_size = 64
    num_attention_heads = 2
    intermediate_size = 128

    chunk_length = 64
    # `num_hashes` comes directly from the function argument

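    # `floats_tensor` is a random-tensor helper (e.g. from the transformers test utilities)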
    hidden_states = floats_tensor((1, 2 ** 16, hidden_size))

    for c, model_name in enumerate(model_names):
        print(f"{c + 1} / {len(model_names)}")

        dictionary[model_name] = {
            "bs": batch_sizes,
            "ss": slice_sizes,
            "results": {},
            "memory": {},
        }
        dictionary[model_name]["results"] = {i: {} for i in batch_sizes}
        dictionary[model_name]["memory"] = {i: {} for i in batch_sizes}

        for batch_size in batch_sizes:

            for slice_size in slice_sizes:

                num_buckets = int(2 * slice_size / chunk_length)
                if num_buckets > chunk_length:
                    factorized_num_buckets = num_buckets // 32
                    num_buckets = [32, factorized_num_buckets]

                bert_config = BertConfig(
                    hidden_size=hidden_size,
                    num_attention_heads=num_attention_heads,
                    intermediate_size=intermediate_size,
                    hidden_dropout_prob=0.0,
                    attention_probs_dropout_prob=0.0,
                )

                reformer_config = ReformerConfig(
                    hidden_size=hidden_size,
                    num_attention_heads=num_attention_heads,
                    intermediate_size=intermediate_size,
                    chunk_length=chunk_length,
                    num_hashes=num_hashes,
                    num_buckets=num_buckets
                )

                layers = {
                    'ReformerLayer': ReformerLayer(reformer_config), 
                    'BertLayer': BertLayer(bert_config)
                }
                model = layers[model_name]

                if fp16:
                    model.half()
                model.to(device)
                model.eval()

                if False:  # no tokenizer-defined maximum input length for these bare layers, so the "N/A" branch is never taken
                    dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
                else:
                    sequence = (
                        hidden_states[0, :slice_size, :]
                        .to(device=device)
                        .repeat(batch_size, 1, 1)
                    )
                    try:
                        if torchscript:
                            print("Tracing model with sequence size", sequence.shape)
                            inference = torch.jit.trace(model, sequence)
                            inference(sequence)
                        else:
                            inference = model
                            if model_name == "ReformerLayer":
                                inference(sequence, sequence)
                            else:
                                inference(sequence)

                        if not no_memory:
                            # model.add_memory_hooks()  # Forward method tracing (only for PyTorch models)

                            trace = start_memory_tracing("transformers")
                            if model_name == "ReformerLayer":
                                inference(sequence, sequence)
                            else:
                                inference(sequence)
                            summary = stop_memory_tracing(trace)

                            if verbose:
                                print_summary_statistics(summary)

                            dictionary[model_name]["memory"][batch_size][
                                slice_size
                            ] = str(summary.total)
                        else:
                            dictionary[model_name]["memory"][batch_size][
                                slice_size
                            ] = "N/A"

                        if not no_speed:
                            print(
                                "Going through model with sequence of shape",
                                sequence.shape,
                            )
                            if model_name == "ReformerLayer":
                                runtimes = timeit.repeat(
                                    lambda: inference(sequence, sequence),
                                    repeat=average_over,
                                    number=3,
                                )
                            else:
                                runtimes = timeit.repeat(
                                    lambda: inference(sequence),
                                    repeat=average_over,
                                    number=3,
                                )
                            average_time = sum(runtimes) / float(len(runtimes)) / 3.0
                            dictionary[model_name]["results"][batch_size][
                                slice_size
                            ] = average_time
                        else:
                            dictionary[model_name]["results"][batch_size][
                                slice_size
                            ] = "N/A"

                    except RuntimeError as e:
                        print("Doesn't fit on GPU.", e)
                        torch.cuda.empty_cache()
                        dictionary[model_name]["results"][batch_size][
                            slice_size
                        ] = "N/A"
                        dictionary[model_name]["memory"][batch_size][slice_size] = "N/A"
    return dictionary
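A minimal driver sketch for the layer-level benchmark above, assuming it is appended to the same script. The import paths for BertLayer / ReformerLayer vary across transformers versions, and `floats_tensor` is re-created here as a stand-in random-tensor helper, so treat these names and paths as assumptions rather than fixed APIs.

import timeit

import torch
from transformers import BertConfig, ReformerConfig
from transformers.models.bert.modeling_bert import BertLayer              # older versions: transformers.modeling_bert
from transformers.models.reformer.modeling_reformer import ReformerLayer  # older versions: transformers.modeling_reformer


def floats_tensor(shape):
    # Stand-in for the helper used above: a random float tensor of the given shape.
    return torch.rand(shape, dtype=torch.float32)


results = _compute_pytorch(
    model_names=["BertLayer", "ReformerLayer"],
    batch_sizes=[1],
    slice_sizes=[512, 1024],
    dictionary={},
    average_over=5,
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    torchscript=False,
    fp16=False,
    no_speed=False,
    no_memory=True,   # memory tracing would need the transformers tracing helpers
    verbose=False,
    num_hashes=1,
)
print(results["BertLayer"]["results"])
print(results["ReformerLayer"]["results"])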