Example #1
    def test(self, model_family, model_name, task):
        local_rank = os.getenv("LOCAL_RANK", "0")
        device = torch.device(f"cuda:{local_rank}")
        dtype = torch.float
        task_dict = lm_eval.tasks.get_task_dict([task])

        if 'gpt-j-6B' in model_name:
            dtype = torch.half
            lm = lm_eval.models.get_model(model_family).create_from_arg_string(
                f"pretrained={model_name}", {"device": "cpu"})
            setattr(lm, model_family,
                    getattr(lm, model_family).half().to(device))
            lm._device = device
        else:
            lm = lm_eval.models.get_model(model_family).create_from_arg_string(
                f"pretrained={model_name}", {"device": f"cuda:{local_rank}"})

        torch.cuda.synchronize()
        start = time.time()
        bs_output = lm_eval.evaluator.evaluate(lm=lm, task_dict=task_dict)
        torch.cuda.synchronize()
        bs_time = time.time() - start

        ds_model = deepspeed.init_inference(
            getattr(lm, model_family),
            mp_size=1,
            dtype=dtype,
            replace_method="auto",
            replace_with_kernel_inject=True,
            enable_cuda_graph=False,
        )
        setattr(lm, model_family, ds_model)
        torch.cuda.synchronize()
        start = time.time()
        ds_output = lm_eval.evaluator.evaluate(lm=lm, task_dict=task_dict)
        torch.cuda.synchronize()
        ds_time = time.time() - start

        ppl_diff = abs(bs_output["results"][task]["ppl"] -
                       ds_output["results"][task]["ppl"])
        #assert ds_time <= bs_time
        assert ppl_diff < 0.01
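
The same baseline-versus-DeepSpeed comparison can be run outside the test harness. Below is a condensed sketch that reuses only the lm-eval and DeepSpeed calls shown above; the model family, model name, and task are illustrative placeholders rather than values taken from the test.

import deepspeed
import lm_eval
import torch

task = "lambada"
task_dict = lm_eval.tasks.get_task_dict([task])
lm = lm_eval.models.get_model("gpt2").create_from_arg_string(
    "pretrained=gpt2", {"device": "cuda:0"})

# Baseline perplexity with the stock HuggingFace model.
baseline = lm_eval.evaluator.evaluate(lm=lm, task_dict=task_dict)

# Swap the underlying model (attribute name matches the model family,
# as in the getattr/setattr calls above) for a kernel-injected engine.
lm.gpt2 = deepspeed.init_inference(lm.gpt2,
                                   mp_size=1,
                                   dtype=torch.float,
                                   replace_method="auto",
                                   replace_with_kernel_inject=True)
ds = lm_eval.evaluator.evaluate(lm=lm, task_dict=task_dict)

print(baseline["results"][task]["ppl"], ds["results"][task]["ppl"])
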
Example #2
                        default=1,
                        help="Model parallel size.")

    args = parser.parse_args()

    args.batch_size = 1

    args = load_hyperparam(args)

    args.tokenizer = str2tokenizer[args.tokenizer](args)

    model = GenerateLm(args)
    model = load_model(model, args.load_model_path)
    deepspeed.init_distributed()
    model = deepspeed.init_inference(model=model,
                                     mp_size=args.mp_size,
                                     replace_method=None)

    rank = dist.get_rank()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if rank == 0:
        model.eval()

        with open(args.test_path, mode="r", encoding="utf-8") as f:
            line = f.readline().strip()
            src = args.tokenizer.convert_tokens_to_ids(
                [CLS_TOKEN] + args.tokenizer.tokenize(line))
            seg = [1] * len(src)
            beginning_length = len(src)
            if len(src) > args.seq_length:
                src = src[:args.seq_length]
Example #3
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    parser.add_argument("--labels_num", type=int, required=True,
                        help="Number of prediction labels.")

    tokenizer_opts(parser)

    parser.add_argument("--output_logits", action="store_true", help="Write logits to output file.")
    parser.add_argument("--output_prob", action="store_true", help="Write probabilities to output file.")

    deepspeed_opts(parser)
    parser.add_argument("--mp_size", type=int, default=1, help="Model parallel size.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = str2tokenizer[args.tokenizer](args)

    # Build classification model and load parameters.
    args.soft_targets, args.soft_alpha = False, False
    deepspeed.init_distributed()
    model = Classifier(args)

    if args.load_model_path:
        model = load_model(model, args.load_model_path)

    model = deepspeed.init_inference(model=model, mp_size=args.mp_size, replace_method=None)

    rank = dist.get_rank()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if rank == 0:
        dataset = read_dataset(args, args.test_path)

        src = torch.LongTensor([sample[0] for sample in dataset])
        seg = torch.LongTensor([sample[1] for sample in dataset])

        batch_size = args.batch_size
        instances_num = src.size()[0]

        print("The number of prediction instances: ", instances_num)

        model.eval()

        with open(args.prediction_path, mode="w", encoding="utf-8") as f:
            f.write("label")
            if args.output_logits:
                f.write("\t" + "logits")
            if args.output_prob:
                f.write("\t" + "prob")
            f.write("\n")
            for i, (src_batch, seg_batch) in enumerate(batch_loader(batch_size, src, seg)):
                src_batch = src_batch.to(device)
                seg_batch = seg_batch.to(device)
                with torch.no_grad():
                    _, logits = model(src_batch, None, seg_batch)

                pred = torch.argmax(logits, dim=1)
                pred = pred.cpu().numpy().tolist()
                prob = nn.Softmax(dim=1)(logits)
                logits = logits.cpu().numpy().tolist()
                prob = prob.cpu().numpy().tolist()

                for j in range(len(pred)):
                    f.write(str(pred[j]))
                    if args.output_logits:
                        f.write("\t" + " ".join([str(v) for v in logits[j]]))
                    if args.output_prob:
                        f.write("\t" + " ".join([str(v) for v in prob[j]]))
                    f.write("\n")
Example #4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected in the list: " +
        ", ".join(MODEL_CLASSES.keys()),
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(MODEL_CLASSES.keys()),
    )
    parser.add_argument(
        "--sample_input",
        default=None,
        type=str,
        required=False,
        help="Optional path to a file of sample input prompts, one per line.",
    )

    parser.add_argument("--prompt", type=str, default="")
    parser.add_argument("--length", type=int, default=20)
    parser.add_argument("--stop_token",
                        type=str,
                        default=None,
                        help="Token at which text generation is stopped")

    parser.add_argument(
        "--temperature",
        type=float,
        default=1.0,
        help=
        "temperature of 1.0 has no effect, lower tend toward greedy sampling",
    )
    parser.add_argument(
        "--repetition_penalty",
        type=float,
        default=1.0,
        help="primarily useful for CTRL model; in that case, use 1.2")
    parser.add_argument("--k", type=int, default=0)
    parser.add_argument("--p", type=float, default=0.9)

    parser.add_argument("--prefix",
                        type=str,
                        default="",
                        help="Text added prior to input.")
    parser.add_argument("--padding_text",
                        type=str,
                        default="",
                        help="Deprecated, the use of `--prefix` is preferred.")
    parser.add_argument("--xlm_language",
                        type=str,
                        default="",
                        help="Optional language when used with the XLM model.")

    parser.add_argument("--local_rank", type=int, default=0, help="local rank")
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Avoid using CUDA when available")
    parser.add_argument("--num_return_sequences",
                        type=int,
                        default=1,
                        help="The number of samples to generate.")
    parser.add_argument(
        "--fp16",
        action="store_true",
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument('--ds-inference',
                        action="store_true",
                        help="Use deepspeed")
    args = parser.parse_args()

    args.device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()

    logger.warning(
        "device: %s, n_gpu: %s, 16-bits training: %s",
        args.device,
        args.n_gpu,
        args.fp16,
    )

    set_seed(args)

    # Initialize the model and tokenizer
    try:
        args.model_type = args.model_type.lower()
        model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    except KeyError:
        raise KeyError(
            "the model {} you specified is not supported. You are welcome to add it and open a PR :)"
            .format(args.model_type))

    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
    model = model_class.from_pretrained(args.model_name_or_path)
    model.cuda(torch.cuda.current_device())

    if args.fp16:
        model.half()

    # initialize the DeepSpeed inference engine
    if args.ds_inference:
        import deepspeed.module_inject as module_inject
        import deepspeed
        injection_policy = {
            gpt2_transformer: module_inject.replace_policy.HFGPT2LayerPolicy
        }
        model = deepspeed.init_inference(
            model,
            mp_size=1,
            dtype=(torch.half if args.fp16 else torch.float),
            injection_policy=injection_policy)
        model = model.module

    args.length = adjust_length_to_model(
        args.length, max_sequence_length=model.config.max_position_embeddings)
    logger.info(args)
    if args.sample_input:
        with open(args.sample_input, "r", encoding="utf8") as f:
            prompt_text = f.readlines()
    else:
        prompt_text = (args.prompt
                       if args.prompt else input("Model prompt >>> "), )

    # Different models need different input formatting and/or extra arguments
    requires_preprocessing = args.model_type in PREPROCESSING_FUNCTIONS.keys()
    eprompt = []
    if requires_preprocessing:
        prepare_input = PREPROCESSING_FUNCTIONS.get(args.model_type)
        # Preprocess each prompt once, then encode every preprocessed prompt.
        preprocessed_prompt_text = [
            prepare_input(args, model, tokenizer, input_text)
            for input_text in prompt_text
        ]

        if model.__class__.__name__ in ["TransfoXLLMHeadModel"]:
            tokenizer_kwargs = {"add_space_before_punct_symbol": True}
        else:
            tokenizer_kwargs = {}
        for ppt in preprocessed_prompt_text:
            eprompt.append(
                tokenizer.encode(ppt,
                                 add_special_tokens=False,
                                 return_tensors="pt",
                                 **tokenizer_kwargs))
    else:
        prefix = args.prefix if args.prefix else args.padding_text
        for ppt in prompt_text:
            eprompt.append(
                tokenizer.encode(prefix + ppt,
                                 add_special_tokens=False,
                                 return_tensors="pt"))

    latencies = []
    for encoded_prompt, ppt in zip(eprompt, prompt_text):
        encoded_prompt = encoded_prompt.to(args.device)

        if encoded_prompt.size()[-1] == 0:
            input_ids = None
        else:
            input_ids = encoded_prompt

        torch.cuda.synchronize()
        t0 = time.time()

        output_sequences = model.generate(
            input_ids=input_ids,
            max_length=args.length + len(encoded_prompt[0]),
            temperature=args.temperature,
            top_k=args.k,
            top_p=args.p,
            repetition_penalty=args.repetition_penalty,
            do_sample=True,
            num_return_sequences=args.num_return_sequences,
        )
        torch.cuda.synchronize()
        latencies.append((time.time() - t0) / output_sequences.numel())

        # Remove the batch dimension when returning multiple sequences
        if len(output_sequences.shape) > 2:
            output_sequences.squeeze_()

        generated_sequences = []

        for generated_sequence_idx, generated_sequence in enumerate(
                output_sequences):
            print(
                "=== GENERATED SEQUENCE {} ===".format(generated_sequence_idx +
                                                       1))
            generated_sequence = generated_sequence.tolist()

            # Decode text
            text = tokenizer.decode(generated_sequence,
                                    clean_up_tokenization_spaces=True)

            # Remove all text after the stop token
            text = text[:text.find(args.stop_token) if args.stop_token else None]

            # Add the prompt at the beginning of the sequence and remove the
            # excess text that was added during pre-processing.
            total_sequence = ppt + text[len(
                tokenizer.decode(encoded_prompt[0],
                                 clean_up_tokenization_spaces=True)):]

            generated_sequences.append(total_sequence)
            print(total_sequence)
    print_latency(latencies)
    return generated_sequences
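
print_latency is assumed to be a small reporting helper defined elsewhere in this script; it is not part of the snippet above. A minimal sketch of what it could do with the per-token latencies collected in the loop:

def print_latency(latencies, label="generation latency per output token"):
    # Report mean and worst-case latency in milliseconds.
    if not latencies:
        return
    mean_ms = 1000.0 * sum(latencies) / len(latencies)
    max_ms = 1000.0 * max(latencies)
    print(f"{label}: mean {mean_ms:.2f} ms, max {max_ms:.2f} ms "
          f"over {len(latencies)} prompts")
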
Example #5
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(
            training_args.output_dir
    ) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(
                training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank
                                                    ) else logging.WARN)

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below)
    # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the
    # sentences in columns called 'sentence1' and 'sentence2' if such column exists or the first two columns not named
    # label if at least two columns are provided.
    #
    # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this
    # single column. You can easily tweak this behavior (see below)
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.task_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset("glue", data_args.task_name)
    else:
        # Loading a dataset from your local files.
        # CSV/JSON training and evaluation files are needed.
        data_files = {
            "train": data_args.train_file,
            "validation": data_args.validation_file
        }

        # Get the test dataset: you can provide your own CSV/JSON test file (see below)
        # when you use `do_predict` without specifying a GLUE benchmark task.
        if training_args.do_predict:
            if data_args.test_file is not None:
                train_extension = data_args.train_file.split(".")[-1]
                test_extension = data_args.test_file.split(".")[-1]
                assert (
                    test_extension == train_extension
                ), "`test_file` should have the same extension (csv or json) as `train_file`."
                data_files["test"] = data_args.test_file
            else:
                raise ValueError(
                    "Need either a GLUE task or a test file for `do_predict`.")

        for key in data_files.keys():
            logger.info(f"load a local file for {key}: {data_files[key]}")

        if data_args.train_file.endswith(".csv"):
            # Loading a dataset from local csv files
            datasets = load_dataset("csv", data_files=data_files)
        else:
            # Loading a dataset from local json files
            datasets = load_dataset("json", data_files=data_files)
    # See more about loading any type of standard or custom dataset at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Labels
    if data_args.task_name is not None:
        is_regression = data_args.task_name == "stsb"
        if not is_regression:
            label_list = datasets["train"].features["label"].names
            num_labels = len(label_list)
        else:
            num_labels = 1
    else:
        # Trying to have good defaults here, don't hesitate to tweak to your needs.
        is_regression = datasets["train"].features["label"].dtype in [
            "float32", "float64"
        ]
        if is_regression:
            num_labels = 1
        else:
            # A useful fast method:
            # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique
            label_list = datasets["train"].unique("label")
            label_list.sort()  # Let's sort it for determinism
            num_labels = len(label_list)

    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    if not training_args.do_train:
        import torch
        # loading the model from the MoQ-trained checkpoint
        sd = torch.load('output/qnli/pytorch_model.bin')
        model.load_state_dict(sd)

        import deepspeed
        import deepspeed.module_inject as module_inject
        deepspeed.init_inference(model,
                                 mp_size=1,
                                 dtype=torch.int8,
                                 replace_method='auto',
                                 quantization_setting=8)

    # Preprocessing the datasets

    if data_args.task_name is not None:
        sentence1_key, sentence2_key = task_to_keys[data_args.task_name]
    else:
        # Again, we try to have some nice defaults but don't hesitate to tweak to your use case.
        non_label_column_names = [
            name for name in datasets["train"].column_names if name != "label"
        ]
        if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names:
            sentence1_key, sentence2_key = "sentence1", "sentence2"
        else:
            if len(non_label_column_names) >= 2:
                sentence1_key, sentence2_key = non_label_column_names[:2]
            else:
                sentence1_key, sentence2_key = non_label_column_names[0], None

    # Padding strategy
    if data_args.pad_to_max_length:
        padding = "max_length"
    else:
        # We will pad later, dynamically at batch creation, to the max sequence length in each batch
        padding = False

    # Some models have set the order of the labels to use, so let's make sure we do use it.
    label_to_id = None
    if (model.config.label2id !=
            PretrainedConfig(num_labels=num_labels).label2id
            and data_args.task_name is not None and not is_regression):
        # Some have all caps in their config, some don't.
        label_name_to_id = {
            k.lower(): v
            for k, v in model.config.label2id.items()
        }
        if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)):
            label_to_id = {
                i: label_name_to_id[label_list[i]]
                for i in range(num_labels)
            }
        else:
            logger.warning(
                "Your model seems to have been trained with labels, but they don't match the dataset: "
                f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}."
                "\nIgnoring the model labels as a result."
            )
    elif data_args.task_name is None and not is_regression:
        label_to_id = {v: i for i, v in enumerate(label_list)}

    if data_args.max_seq_length > tokenizer.model_max_length:
        logger.warning(
            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    def preprocess_function(examples):
        # Tokenize the texts
        args = ((examples[sentence1_key], ) if sentence2_key is None else
                (examples[sentence1_key], examples[sentence2_key]))
        result = tokenizer(*args,
                           padding=padding,
                           max_length=max_seq_length,
                           truncation=True)

        # Map labels to IDs (not necessary for GLUE tasks)
        if label_to_id is not None and "label" in examples:
            result["label"] = [label_to_id[l] for l in examples["label"]]
        return result

    datasets = datasets.map(preprocess_function,
                            batched=True,
                            load_from_cache_file=not data_args.overwrite_cache)

    train_dataset = datasets["train"]
    eval_dataset = datasets["validation_matched" if data_args.task_name ==
                            "mnli" else "validation"]
    if data_args.task_name is not None or data_args.test_file is not None:
        test_dataset = datasets["test_matched" if data_args.task_name ==
                                "mnli" else "test"]

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(
            f"Sample {index} of the training set: {train_dataset[index]}.")

    # Get the metric function
    if data_args.task_name is not None:
        metric = load_metric("glue", data_args.task_name)
    # TODO: When datasets metrics include regular accuracy, make an else here and remove special branch from
    # compute_metrics

    # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
    # predictions and label_ids field) and has to return a dictionary mapping strings to floats.
    def compute_metrics(p: EvalPrediction):
        preds = p.predictions[0] if isinstance(p.predictions,
                                               tuple) else p.predictions
        preds = np.squeeze(preds) if is_regression else np.argmax(preds,
                                                                  axis=1)
        if data_args.task_name is not None:
            result = metric.compute(predictions=preds, references=p.label_ids)
            if len(result) > 1:
                result["combined_score"] = np.mean(list(
                    result.values())).item()
            return result
        elif is_regression:
            return {"mse": ((preds - p.label_ids)**2).mean().item()}
        else:
            return {
                "accuracy":
                (preds == p.label_ids).astype(np.float32).mean().item()
            }

    # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    elif training_args.fp16:
        data_collator = DataCollatorWithPadding(tokenizer,
                                                pad_to_multiple_of=8)
    else:
        data_collator = None

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        if last_checkpoint is not None:
            checkpoint = last_checkpoint
        elif os.path.isdir(model_args.model_name_or_path):
            checkpoint = model_args.model_name_or_path
        else:
            checkpoint = None
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        metrics = train_result.metrics
        trainer.save_model()  # Saves the tokenizer too for easy upload
        output_train_file = os.path.join(training_args.output_dir,
                                         "train_results.txt")
        if trainer.is_world_process_zero():
            with open(output_train_file, "w") as writer:
                logger.info("***** Train results *****")
                for key, value in sorted(metrics.items()):
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

            # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
            trainer.state.save_to_json(
                os.path.join(training_args.output_dir, "trainer_state.json"))

    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        # Loop to handle MNLI double evaluation (matched, mis-matched)
        tasks = [data_args.task_name]
        eval_datasets = [eval_dataset]
        if data_args.task_name == "mnli":
            tasks.append("mnli-mm")
            eval_datasets.append(datasets["validation_mismatched"])

        for eval_dataset, task in zip(eval_datasets, tasks):
            eval_result = trainer.evaluate(eval_dataset=eval_dataset)

            output_eval_file = os.path.join(training_args.output_dir,
                                            f"eval_results_{task}.txt")
            if trainer.is_world_process_zero():
                with open(output_eval_file, "w") as writer:
                    logger.info(f"***** Eval results {task} *****")
                    for key, value in sorted(eval_result.items()):
                        logger.info(f"  {key} = {value}")
                        writer.write(f"{key} = {value}\n")

            eval_results.update(eval_result)

    if training_args.do_predict:
        logger.info("*** Test ***")

        # Loop to handle MNLI double evaluation (matched, mis-matched)
        tasks = [data_args.task_name]
        test_datasets = [test_dataset]
        if data_args.task_name == "mnli":
            tasks.append("mnli-mm")
            test_datasets.append(datasets["test_mismatched"])

        for test_dataset, task in zip(test_datasets, tasks):
            # Removing the `label` column because it contains -1 and Trainer won't like that.
            test_dataset.remove_columns_("label")
            predictions = trainer.predict(
                test_dataset=test_dataset).predictions
            predictions = np.squeeze(
                predictions) if is_regression else np.argmax(predictions,
                                                             axis=1)

            output_test_file = os.path.join(training_args.output_dir,
                                            f"test_results_{task}.txt")
            if trainer.is_world_process_zero():
                with open(output_test_file, "w") as writer:
                    logger.info(f"***** Test results {task} *****")
                    writer.write("index\tprediction\n")
                    for index, item in enumerate(predictions):
                        if is_regression:
                            writer.write(f"{index}\t{item:3.3f}\n")
                        else:
                            item = label_list[item]
                            writer.write(f"{index}\t{item}\n")
    return eval_results
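
The interesting part of this script is the evaluation-only branch, which loads a MoQ (Mixture-of-Quantization) trained checkpoint and wraps it with DeepSpeed's INT8 inference kernels. Stripped of the GLUE plumbing, that path reduces to the sketch below; the model name is a placeholder and the checkpoint path simply mirrors the one hard-coded above.

import torch
import deepspeed
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased",
                                                           num_labels=2)
# Load weights produced by MoQ training.
model.load_state_dict(torch.load("output/qnli/pytorch_model.bin"))

# Inject DeepSpeed's INT8 inference kernels, mirroring the call in the script above.
deepspeed.init_inference(model,
                         mp_size=1,
                         dtype=torch.int8,
                         replace_method='auto',
                         quantization_setting=8)
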
Example #6
from transformers import pipeline
import transformers
import deepspeed
import torch
import os
from transformers.models.roberta.modeling_roberta import RobertaLayer

local_rank = int(os.getenv('LOCAL_RANK', '0'))
world_size = int(os.getenv('WORLD_SIZE', '4'))

pipe = pipeline('fill-mask', model="roberta-large", device=local_rank)

# The injection_policy shows two things:
#   1. which layer module we need to add Tensor-Parallelism to, and
#   2. the names of that module's output linear layers (here the RobertaLayer
#      output projection) whose outputs are reduced across model-parallel ranks.

pipe.model = deepspeed.init_inference(
    pipe.model,
    mp_size=world_size,
    dtype=torch.float,
    injection_policy={RobertaLayer: ('output.dense')})

pipe.device = torch.device(f'cuda:{local_rank}')
output = pipe("Hello I'm a <mask> model.")

if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
    print(output)
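
Snippets like this one read LOCAL_RANK and WORLD_SIZE from the environment, which the DeepSpeed launcher exports for every process it spawns. A typical invocation, assuming the snippet is saved as a standalone script (the file name below is a placeholder):

# Launch 4 model-parallel processes; each rank runs the script with its own
# LOCAL_RANK, and WORLD_SIZE is set to 4 for all of them.
#
#   deepspeed --num_gpus 4 fill_mask_roberta_tp.py
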
Example #7
import os
import torch
import deepspeed
import soundfile as sf

from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2EncoderLayer

# NOTE: this snippet assumes librispeech_eval is an evaluation split (e.g. LibriSpeech)
# that was loaded earlier with the datasets library.

# Get local gpu rank from torch.distributed/deepspeed launcher
local_rank = int(os.getenv('LOCAL_RANK', '0'))
world_size = int(os.getenv('WORLD_SIZE', '1'))

print(
    "***************** Creating model in RANK ({0}) with WORLD_SIZE = {1} *****************"
    .format(local_rank,
            world_size))

model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

model = deepspeed.init_inference(model,
                                 mp_size=world_size,
                                 dtype=torch.float,
                                 injection_policy={Wav2Vec2EncoderLayer: ('attention.out_proj','feed_forward.output_dense')},
                                 replace_with_kernel_inject=False)
model.to(f'cuda:{local_rank}')


def map_to_array(batch):
    speech, _ = sf.read(batch["file"])
    batch["speech"] = speech
    return batch

librispeech_eval = librispeech_eval.map(map_to_array)

def map_to_pred(batch):
    input_values = processor(batch["speech"], return_tensors="pt", padding="longest").input_values
    with torch.no_grad():
        logits = model(input_values.to(f'cuda:{local_rank}')).logits
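
    # --- Assumed continuation (not in the original snippet): the usual greedy
    # CTC decoding step for wav2vec2, followed by applying map_to_pred to the
    # evaluation split. Column names and batch_size are illustrative. ---
    predicted_ids = torch.argmax(logits, dim=-1)
    batch["transcription"] = processor.batch_decode(predicted_ids)
    return batch

result = librispeech_eval.map(map_to_pred, batched=True, batch_size=1)
if local_rank == 0:
    print(result["transcription"][:2])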
Example #8
import os
import torch
import deepspeed

from transformers import pipeline
from transformers.models.t5.modeling_t5 import T5Block

local_rank = int(os.getenv('LOCAL_RANK', '0'))
world_size = int(os.getenv('WORLD_SIZE', '4'))

pipe = pipeline("text2text-generation",
                model="google/t5-v1_1-small",
                device=local_rank)

# The injection_policy shows two things:
#   1. which layer module we need to add Tensor-Parallelism to, and
#   2. the names of several linear layers: a) the attention output (for both
#      encoder and decoder), and b) the transformer-block output.

pipe.model = deepspeed.init_inference(pipe.model,
                                      mp_size=world_size,
                                      dtype=torch.float,
                                      injection_policy={
                                          T5Block: ('SelfAttention.o',
                                                    'EncDecAttention.o',
                                                    'DenseReluDense.wo')
                                      })

pipe.device = torch.device(f'cuda:{local_rank}')
output = pipe(
    "Is this review positive or negative? Review: this is the best cast iron skillet you will ever buy"
)

if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
    print(output)
Example #9
import os
import torch
import deepspeed
import transformers

from deepspeed import module_inject
from transformers import pipeline
from transformers.models.gpt_neo.modeling_gpt_neo import GPTNeoBlock as gpt2_transformer

# Get local gpu rank from torch.distributed/deepspeed launcher
local_rank = int(os.getenv('LOCAL_RANK', '0'))
world_size = int(os.getenv('WORLD_SIZE', '1'))

print(
    "***************** Creating model in RANK ({0}) with WORLD_SIZE = {1} *****************"
    .format(local_rank,
            world_size))

generator = pipeline('text-generation',
                     model='EleutherAI/gpt-neo-2.7B',
                     device=local_rank)
generator.model = deepspeed.init_inference(generator.model,
                                           mp_size=world_size,
                                           dtype=torch.float,
                                           replace_method='auto')
string = generator("DeepSpeed is", do_sample=True, min_length=50)
print(string)
Example #10
import os
import torch
import deepspeed

from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.models.gptj.modeling_gptj import GPTJBlock

# Get local gpu rank from torch.distributed/deepspeed launcher
local_rank = int(os.getenv('LOCAL_RANK', '0'))
world_size = int(os.getenv('WORLD_SIZE', '1'))

print(
    "***************** Creating model in RANK ({0}) with WORLD_SIZE = {1} *****************"
    .format(local_rank, world_size))

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")

inp_tokens = tokenizer(
    "DeepSpeed is",
    return_tensors="pt",
)
model = deepspeed.init_inference(
    model,
    mp_size=world_size,
    dtype=torch.float,
    injection_policy={GPTJBlock: ('attn.out_proj', 'mlp.fc_out')},
    replace_with_kernel_inject=False)

for token in inp_tokens:
    if torch.is_tensor(inp_tokens[token]):
        inp_tokens[token] = inp_tokens[token].to(f'cuda:{local_rank}')

model.cuda().to(f'cuda:{local_rank}')
string = tokenizer.batch_decode(model.generate(
    **inp_tokens,
    min_length=50,
))[0]
print(string)
Example #11
    def __init__(self, *args, **kwargs):
        """
        Add following variables under 'trainer_mixin_args' through the training
        arguments.

        :param teacher_model_names_or_paths: List of pretrained model names or paths to
                                             use as teachers in knowledge distillation.
        :param teacher_models_cache_dir: (optional) directory to load and save
                                         pre-trained teacher models
        :param kd_ensemble_weights: List of weights to apply to each teacher model
                                during distillation.
                                If the total is > 1 the loss will be scaled out
                                of proportion, acting in practice as a scaling factor
                                to the learning rate (the equivalence is true
                                in the composite loss model, and only approximate
                                for the regular distillation model. Scaling the
                                softmax out of proportion creates a target that
                                is impossible to reach, since the output distribution
                                can only sum to 1)
        :param kd_factor_init: Determines the percentage of the target that comes
                            from the teacher model. Value should be float
                            between 0 and 1. Defaults to 1.
        :param kd_factor_end: KD factor at last epoch. Will calculate linear decay
                            based on initial kd_factor_init and kd_factor_end.
                            Value should be float between 0 and 1.
                            If None, no decay is applied. Defaults to None.
        :param kd_temperature_init: Determines the temperature T applied to softmax.
                                If T > 1, it smoothes the softmax distribution.
                                If T < 1, it sharpens the distribution (more mass to
                                few points). If kd_temperature_end is also defined,
                                this variable equals the temperature at the beginning
                                of training. Defaults to 1.0
        :param kd_temperature_end: Determines the temperature applied to softmax.
                                Will calculate linear decay based on
                                kd_temperature_init and kd_temperature_end.
                                If None, no decay is applied. Defaults to None.
        """

        super().__init__(*args, **kwargs)

        mixin_args = self.args.trainer_mixin_args

        teacher_names_or_paths = mixin_args.get("teacher_model_names_or_paths",
                                                None)
        teacher_models_cache_dir = mixin_args.get("teacher_models_cache_dir",
                                                  None)
        kd_ensemble_weights = mixin_args.get("kd_ensemble_weights", None)
        kd_factor_init = mixin_args.get("kd_factor_init", 1.0)
        kd_factor_end = mixin_args.get("kd_factor_end", 1.0)
        kd_temperature_init = mixin_args.get("kd_temperature_init", 1.0)
        kd_temperature_end = mixin_args.get("kd_temperature_end", 1.0)

        # Validate teacher models
        assert (
            isinstance(teacher_names_or_paths, list)
            and len(teacher_names_or_paths) > 0
        ), "When using KD mixin, teacher_model_names_or_paths must be defined"

        seq_length = get_model_seq_length(self.model)
        teacher_models = []
        for model_name_or_path in teacher_names_or_paths:
            teacher_model = AutoModelForMaskedLM.from_pretrained(
                model_name_or_path, cache_dir=teacher_models_cache_dir)
            if self.args.fp16:
                teacher_model.half()
            teacher_model.resize_token_embeddings(len(self.tokenizer))
            teacher_model = resize_position_embeddings(teacher_model,
                                                       seq_length)
            teacher_model = teacher_model.eval().to(self.args.device)

            # Use deepspeed inference mode on teacher models
            if self.args.deepspeed:
                ds_engine = deepspeed.init_inference(teacher_model,
                                                     dtype=torch.half,
                                                     replace_method="auto")
                teacher_model = ds_engine.module

            teacher_models.append(teacher_model)

        if len(teacher_models) == 1:
            logging.info(
                f"KD single teacher class: {teacher_models[0].__class__}")
        else:
            logging.info(
                f"KD teacher is ensemble of {len(teacher_models)} models")

        self.teacher_models = teacher_models

        # Validate knowledge Distillation factor
        assert 0 <= kd_factor_init <= 1, "kd_factor_init should be >= 0 and <= 1"
        assert 0 <= kd_factor_end <= 1, "kd_factor_end should be >= 0 and <= 1"
        logging.info(f"KD factor: {kd_factor_init} {kd_factor_end}")

        # Validate Knowledge softmax temperature factor
        logging.info(
            f"KD softmax temperature: {kd_temperature_init} {kd_temperature_end}"
        )

        # Validate ensemble weighting
        num_models = len(teacher_models)
        if kd_ensemble_weights is None:
            kd_ensemble_weights = [1.0 / num_models for _ in range(num_models)]
        else:
            assert (
                len(kd_ensemble_weights) == num_models
            ), "Number of ensemble weights should match number of teacher models"
        logging.info(f"Ensemble weights: {kd_ensemble_weights}")

        # Initialize KD as a label smoother
        self.label_smoother = KDLoss(
            num_classes=list(self.model.parameters())[-1].size()[0],
            kd_ensemble_weights=kd_ensemble_weights,
            kd_factor_init=kd_factor_init,
            kd_factor_end=kd_factor_end,
            kd_temperature_init=kd_temperature_init,
            kd_temperature_end=kd_temperature_end,
        )
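
The linear decay of kd_factor and kd_temperature described in the docstring is handled elsewhere (inside KDLoss and the training loop) and is not shown above. A minimal sketch of such a schedule; the helper name and call pattern are illustrative assumptions rather than part of the mixin:

def linear_schedule(value_init, value_end, epoch, total_epochs):
    """Linearly interpolate from value_init (first epoch) to value_end (last epoch)."""
    if value_end is None or total_epochs <= 1:
        return value_init
    progress = epoch / (total_epochs - 1)
    return value_init + (value_end - value_init) * progress

# e.g. kd_factor_init=1.0, kd_factor_end=0.5 over 10 epochs:
#   epoch 0 -> 1.0, epoch 4 -> ~0.78, epoch 9 -> 0.5
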
Example #12
import os
import torch
import deepspeed
import transformers

from deepspeed import module_inject
from transformers import pipeline
from transformers.models.gpt_neo.modeling_gpt_neo import GPTNeoBlock as gpt2_transformer

# Get local gpu rank from torch.distributed/deepspeed launcher
local_rank = int(os.getenv('LOCAL_RANK', '0'))
world_size = int(os.getenv('WORLD_SIZE', '1'))

print(
    "***************** Creating model in RANK ({0}) with WORLD_SIZE = {1} *****************"
    .format(local_rank,
            world_size))
generator = pipeline('text-generation',
                     model='EleutherAI/gpt-neo-2.7B',
                     device=local_rank)
generator.model = deepspeed.init_inference(generator.model,
                                           mp_size=world_size,
                                           dtype=torch.float,
                                           replace_method='auto',
                                           replace_with_kernel_inject=True)
string = generator("DeepSpeed is", do_sample=True, min_length=50)
print(string)
Example #13
    def test(
        self,
        model_w_task,
        dtype,
        enable_cuda_graph,
        query,
        inf_kwargs,
        assert_fn,
        invalid_model_task_config,
    ):
        if invalid_model_task_config:
            pytest.skip(invalid_model_task_config)

        model, task = model_w_task
        local_rank = int(os.getenv("LOCAL_RANK", "0"))

        if "gpt-j-6B" in model and dtype == torch.half:
            _model = AutoModelForCausalLM.from_pretrained(model)
            tokenizer = AutoTokenizer.from_pretrained(model)
            _model.half()
            pipe = pipeline(
                task,
                model=_model,
                tokenizer=tokenizer,
                device=local_rank,
                framework="pt",
            )
        else:
            pipe = pipeline(task,
                            model=model,
                            device=local_rank,
                            framework="pt")
            if dtype == torch.half:
                pipe.model.half()

        # Warm-up queries for perf measurement
        #for i in range(10):
        #    _ = pipe(query, **inf_kwargs)
        torch.cuda.synchronize()
        start = time.time()
        bs_output = pipe(query, **inf_kwargs)
        torch.cuda.synchronize()
        bs_time = time.time() - start

        pipe.model = deepspeed.init_inference(
            pipe.model,
            mp_size=1,
            dtype=dtype,
            replace_method="auto",
            replace_with_kernel_inject=True,
            enable_cuda_graph=enable_cuda_graph,
        )
        # Warm-up queries for perf measurement
        #for i in range(10):
        #    _ = pipe(query, **inf_kwargs)
        torch.cuda.synchronize()
        start = time.time()
        ds_output = pipe(query, **inf_kwargs)
        torch.cuda.synchronize()
        ds_time = time.time() - start

        if task == "text-generation":
            bs_output = pipe(query, **inf_kwargs)

        # These performance tests only measure the time for a single inference
        # request; we just want to check that performance isn't terrible.
        #assert ds_time <= (bs_time * 1.1)
        assert assert_fn(bs_output, ds_output)
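
The commented-out loops above hint at running warm-up queries before timing. A minimal sketch of that warm-up-then-measure pattern as a standalone helper (the helper itself is illustrative and not part of the test):

def timed_inference(pipe, query, inf_kwargs, warmup=10):
    # Run a few un-timed queries so CUDA kernels and caches are warm,
    # then time a single inference call.
    for _ in range(warmup):
        _ = pipe(query, **inf_kwargs)
    torch.cuda.synchronize()
    start = time.time()
    output = pipe(query, **inf_kwargs)
    torch.cuda.synchronize()
    return output, time.time() - start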