Ejemplo n.º 1
0
def main():
    """Train and/or evaluate a language model driven by command-line arguments.

    Parses ``ModelArguments``, ``DataTrainingArguments`` and
    ``TrainingArguments`` from the command line, builds the config, tokenizer,
    model, datasets and data collator, and then runs training and/or
    evaluation through a ``Trainer``.

    Returns:
        dict: ``{"perplexity": <float>}`` when ``--do_eval`` is set, otherwise
        an empty dict.

    Raises:
        ValueError: If evaluation is requested without ``--eval_data_file``,
            if the output directory is non-empty and ``--overwrite_output_dir``
            was not given, if neither a tokenizer name nor a model path is
            supplied, or if a BERT/RoBERTa-like model is used without
            ``--mlm``.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")
    # Refuse to clobber an existing, non-empty output directory when training
    # unless the user explicitly opted in.
    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging: INFO when local_rank is -1 or 0, WARN for any other rank.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    # Emitted at WARNING level so that it is visible on every process.
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    # Config precedence: explicit --config_name, then the model path, then a
    # fresh default config for the requested model type.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                            cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    # Tokenizer precedence mirrors the config, except that there is no
    # "from scratch" fallback.
    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it, "
            "and load it from here, using --tokenizer_name")

    if model_args.model_name_or_path:
        model = AutoModelForMaskedLM.from_pretrained(
            model_args.model_name_or_path,
            # A ".ckpt" in the path is taken to mean a TensorFlow checkpoint.
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForMaskedLM.from_config(config)

    # Keep the embedding matrix in sync with the tokenizer's vocabulary size.
    model.resize_token_embeddings(len(tokenizer))

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"
                             ] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the "
            "--mlm flag (masked language modeling).")

    if data_args.block_size <= 0:
        # Our input block size will be the max possible for the model
        data_args.block_size = tokenizer.model_max_length
    else:
        data_args.block_size = min(data_args.block_size,
                                   tokenizer.model_max_length)

    # Get datasets (only the ones that the requested phases need)

    train_dataset = (get_dataset(
        data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir)
                     if training_args.do_train else None)
    eval_dataset = (get_dataset(data_args,
                                tokenizer=tokenizer,
                                evaluate=True,
                                cache_dir=model_args.cache_dir)
                    if training_args.do_eval else None)

    # Pick the collator: permutation LM for XLNet, whole-word masking when
    # requested together with --mlm, plain (masked) LM otherwise.
    if config.model_type == "xlnet":
        data_collator = DataCollatorForPermutationLanguageModeling(
            tokenizer=tokenizer,
            plm_probability=data_args.plm_probability,
            max_span_length=data_args.max_span_length,
        )
    else:
        if data_args.mlm and data_args.whole_word_mask:
            data_collator = DataCollatorForWholeWordMask(
                tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)
        else:
            data_collator = DataCollatorForLanguageModeling(
                tokenizer=tokenizer,
                mlm=data_args.mlm,
                mlm_probability=data_args.mlm_probability)

    # Initialize our Trainer
    trainer = Trainer(model=model,
                      args=training_args,
                      data_collator=data_collator,
                      train_dataset=train_dataset,
                      eval_dataset=eval_dataset)

    # Training
    if training_args.do_train:
        # Only pass a model path to train() when it is a local directory.
        model_path = (model_args.model_name_or_path
                      if model_args.model_name_or_path is not None
                      and os.path.isdir(model_args.model_name_or_path) else
                      None)
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_process_zero():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        # Perplexity is the exponential of the evaluation loss.
        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results_lm.txt")
        # Only the world-zero process writes the results file.
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    return results
Ejemplo n.º 2
0
    ds=data_loader.dataset
    sum=0
    for i, (seq1,seq2,mask1,mask2) in enumerate(tqdm(data_loader)):
        input_ids1, attention_mask1= split_seq(seq1.to(device)),split_seq(mask1.to(device))
        input_ids2, attention_mask2= split_seq(seq2.to(device)),split_seq(mask2.to(device))
        with torch.no_grad():
            for i in range(len(input_ids1)):
                input,label=ds.DataCollatorForLanguageModeling(input_ids1[i])
                output=net(input_ids=input,attention_mask=attention_mask1[i],labels=label)
                sum+=output.loss.cpu().numpy()

            for i in range(len(input_ids2)):
                input,label=ds.DataCollatorForLanguageModeling(input_ids2[i])
                output=net(input_ids=input,attention_mask=attention_mask2[i],labels=label)
                sum+=output.loss.cpu().numpy()
    fi=open("loge5.txt","a")
    print("eval loss ",sum,file=fi)
    fi.close()
# Script entry: build a masked-LM model from the 'bert-base-cased' configuration
# with a vocabulary sized from the dataset, then alternate training and
# evaluation for a fixed number of rounds, saving the model after each round.
config = AutoConfig.from_pretrained('bert-base-cased')
# Vocabulary size comes from the dataset's MAXTOKEN plus a margin of 10.
# NOTE(review): the reason for the +10 margin is not visible here — confirm.
config.vocab_size=total_dataset.MAXTOKEN+10
# The model is built from the config object (from_config), then moved to `device`.
model = AutoModelForMaskedLM.from_config(config).to(device)
optim = AdamW(model.parameters(), lr=1e-5)

# Evaluate once before any training round.
test(model,test_loader)
# 50 rounds; the round index is passed to both train() and savemodel().
for i in range(50):
    print("start our training")
    train(model,train_loader,optim,i)
    print("start eval")
    test(model,test_loader)
    savemodel(model,i)
Ejemplo n.º 3
0
    def test_trainer_iterable_dataset(self):
        """Check how ``Trainer`` handles an iterable dataset without ``__len__``.

        Trains a tiny model for two steps on a ``SampleIterableDataset``,
        checks the sampler type of the resulting train dataloader, and then
        asserts that ``ValueError`` is raised when:

        * an iterable training dataset is given without ``max_steps``;
        * an iterable dataset is passed as ``eval_dataset``;
        * ``predict`` is called with an iterable dataset;
        * ``evaluate`` is called with an iterable dataset.

        Setup code is kept outside each ``assertRaises`` block so that only
        the call under test can satisfy the assertion.
        """
        # Simulate Language Modeling with an IterableDataset, with no __len__ method
        # Pick-up a tiny model, so it works on CPU
        # See Issue #5990: https://github.com/huggingface/transformers/issues/5990
        MODEL_ID = "sshleifer/tiny-distilbert-base-cased"
        model = AutoModelForMaskedLM.from_pretrained(MODEL_ID)
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
        train_dataset = SampleIterableDataset(file_path=PATH_SAMPLE_TEXT,
                                              tokenizer=tokenizer)
        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                        mlm=True,
                                                        mlm_probability=0.15)

        # max_steps is set explicitly for the iterable training dataset.
        training_args = TrainingArguments(output_dir="./examples",
                                          no_cuda=True,
                                          max_steps=2)
        trainer = Trainer(model=model,
                          args=training_args,
                          train_dataset=train_dataset,
                          data_collator=data_collator)
        trainer.train()

        loader = trainer.get_train_dataloader()
        self.assertIsInstance(loader, torch.utils.data.DataLoader)
        self.assertIsInstance(
            loader.sampler,
            torch.utils.data.dataloader._InfiniteConstantSampler)

        # Exception if giving iterable dataset and no max_steps
        training_args = TrainingArguments(output_dir="./examples",
                                          no_cuda=True)
        with self.assertRaises(ValueError):
            _ = Trainer(model=model,
                        args=training_args,
                        train_dataset=train_dataset,
                        data_collator=data_collator)

        # Exception if eval_dataset is iterable in __init__
        training_args = TrainingArguments(output_dir="./examples",
                                          no_cuda=True,
                                          max_steps=2)
        with self.assertRaises(ValueError):
            _ = Trainer(
                model=model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=train_dataset,
                data_collator=data_collator,
            )

        # Exception if predicting with iterable dataset
        training_args = TrainingArguments(output_dir="./examples",
                                          no_cuda=True)
        trainer = Trainer(model=model,
                          args=training_args,
                          data_collator=data_collator)
        with self.assertRaises(ValueError):
            trainer.predict(train_dataset)

        # Exception if evaluating with iterable dataset
        training_args = TrainingArguments(output_dir="./examples",
                                          no_cuda=True)
        trainer = Trainer(model=model,
                          args=training_args,
                          data_collator=data_collator)
        with self.assertRaises(ValueError):
            trainer.evaluate(train_dataset)
Ejemplo n.º 4
0
def main():
    """Train and/or evaluate a masked language model with whole-word masking.

    Arguments are read either from a single ``.json`` file given as the only
    command-line argument or from regular command-line flags. The function
    loads the datasets, config, tokenizer and model, tokenizes the text
    column, optionally attaches Chinese reference files, and runs training
    and/or evaluation through a ``Trainer``.

    Returns:
        dict: ``{"perplexity": <float>}`` when ``--do_eval`` is set, otherwise
        an empty dict.

    Raises:
        ValueError: If the output directory is non-empty and
            ``--overwrite_output_dir`` was not given, if no dataset name and
            no data file is supplied, or if neither a tokenizer name nor a
            model path is supplied.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    # Refuse to clobber an existing, non-empty output directory when training
    # unless the user explicitly opted in.
    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome.")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        +
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name,
                                data_args.dataset_config_name)
        if "validation" not in datasets.keys():
            # No validation split shipped: carve the first N% of "train" out
            # as validation and keep the remainder for training.
            datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[:{data_args.validation_split_percentage}%]",
            )
            datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[{data_args.validation_split_percentage}%:]",
            )
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        # Infer the loader from the training file when there is one, and fall
        # back to the validation file so evaluation-only runs do not crash.
        reference_file = (data_args.train_file
                          if data_args.train_file is not None else
                          data_args.validation_file)
        if reference_file is None:
            raise ValueError(
                "Need either a dataset name or a training/validation file.")
        extension = reference_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
        datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                            cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name,
            cache_dir=model_args.cache_dir,
            use_fast=model_args.use_fast_tokenizer)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
            use_fast=model_args.use_fast_tokenizer)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if model_args.model_name_or_path:
        model = AutoModelForMaskedLM.from_pretrained(
            model_args.model_name_or_path,
            # A ".ckpt" in the path is taken to mean a TensorFlow checkpoint.
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForMaskedLM.from_config(config)

    # Keep the embedding matrix in sync with the tokenizer's vocabulary size.
    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    padding = "max_length" if data_args.pad_to_max_length else False

    def tokenize_function(examples):
        """Drop empty/whitespace-only lines and tokenize the remaining ones."""
        # Use the detected text column rather than a hard-coded "text" key so
        # datasets whose text lives in another column are handled as well.
        examples[text_column_name] = [
            line for line in examples[text_column_name]
            if len(line) > 0 and not line.isspace()
        ]
        return tokenizer(examples[text_column_name],
                         padding=padding,
                         truncation=True,
                         max_length=data_args.max_seq_length)

    tokenized_datasets = datasets.map(
        tokenize_function,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        remove_columns=[text_column_name],
        load_from_cache_file=not data_args.overwrite_cache,
    )

    # Add the chinese references if provided
    if data_args.train_ref_file is not None:
        tokenized_datasets["train"] = add_chinese_references(
            tokenized_datasets["train"], data_args.train_ref_file)
    if data_args.validation_ref_file is not None:
        tokenized_datasets["validation"] = add_chinese_references(
            tokenized_datasets["validation"], data_args.validation_ref_file)

    # Data collator
    # This one will take care of randomly masking the tokens.
    data_collator = DataCollatorForWholeWordMask(
        tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"]
        if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"]
        if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        # Only pass a model path to train() when it is a local directory.
        model_path = (model_args.model_name_or_path if
                      (model_args.model_name_or_path is not None
                       and os.path.isdir(model_args.model_name_or_path)) else
                      None)
        trainer.train(model_path=model_path)
        trainer.save_model()  # Saves the tokenizer too for easy upload

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        # Perplexity is the exponential of the evaluation loss.
        perplexity = math.exp(eval_output["eval_loss"])
        results["perplexity"] = perplexity

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results_mlm_wwm.txt")
        # Only the world-zero process writes the results file.
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    return results
Ejemplo n.º 5
0
def main():
    """Train a masked language model with a hand-written Accelerate loop.

    Reads the configuration from ``parse_args()``, loads the datasets, config,
    tokenizer and model, tokenizes the text (line by line, or concatenated and
    split into ``max_seq_length`` chunks), and then trains with AdamW and a
    learning-rate scheduler. Perplexity on the validation split is logged
    after every epoch, and the model is saved to ``args.output_dir`` when that
    is set.

    Returns:
        None.

    Raises:
        ValueError: If neither ``--tokenizer_name`` nor
            ``--model_name_or_path`` is supplied.
    """
    args = parse_args()

    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
    accelerator = Accelerator()
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state)

    # Setup logging, we only want one process per machine to log things on the screen.
    # accelerator.is_local_main_process is only True for one process per machine.
    logger.setLevel(
        logging.INFO if accelerator.is_local_main_process else logging.ERROR)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(args.dataset_name,
                                    args.dataset_config_name)
        if "validation" not in raw_datasets.keys():
            # No validation split shipped: carve the first N% of "train" out
            # as validation and keep the remainder for training.
            raw_datasets["validation"] = load_dataset(
                args.dataset_name,
                args.dataset_config_name,
                split=f"train[:{args.validation_split_percentage}%]",
            )
            raw_datasets["train"] = load_dataset(
                args.dataset_name,
                args.dataset_config_name,
                split=f"train[{args.validation_split_percentage}%:]",
            )
    else:
        data_files = {}
        if args.train_file is not None:
            data_files["train"] = args.train_file
        if args.validation_file is not None:
            data_files["validation"] = args.validation_file
        # The file extension of the training file selects the dataset loader.
        extension = args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
        raw_datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if args.config_name:
        config = AutoConfig.from_pretrained(args.config_name)
    elif args.model_name_or_path:
        config = AutoConfig.from_pretrained(args.model_name_or_path)
    else:
        config = CONFIG_MAPPING[args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    if args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            args.tokenizer_name, use_fast=not args.use_slow_tokenizer)
    elif args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            args.model_name_or_path, use_fast=not args.use_slow_tokenizer)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if args.model_name_or_path:
        model = AutoModelForMaskedLM.from_pretrained(
            args.model_name_or_path,
            # A ".ckpt" in the path is taken to mean a TensorFlow checkpoint.
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForMaskedLM.from_config(config)

    # Keep the embedding matrix in sync with the tokenizer's vocabulary size.
    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    column_names = raw_datasets["train"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    if args.max_seq_length is None:
        max_seq_length = tokenizer.model_max_length
        if max_seq_length > 1024:
            logger.warning(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
            )
            max_seq_length = 1024
    else:
        if args.max_seq_length > tokenizer.model_max_length:
            logger.warning(
                f"The max_seq_length passed ({args.max_seq_length}) is larger than the maximum length for the "
                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
            )
        max_seq_length = min(args.max_seq_length, tokenizer.model_max_length)

    if args.line_by_line:
        # When using line_by_line, we just tokenize each nonempty line.
        padding = "max_length" if args.pad_to_max_length else False

        def tokenize_function(examples):
            """Drop empty/whitespace-only lines and tokenize the rest."""
            # Use the detected text column rather than a hard-coded "text" key
            # so datasets whose text lives in another column work as well.
            examples[text_column_name] = [
                line for line in examples[text_column_name]
                if len(line) > 0 and not line.isspace()
            ]
            return tokenizer(
                examples[text_column_name],
                padding=padding,
                truncation=True,
                max_length=max_seq_length,
                # We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it
                # receives the `special_tokens_mask`.
                return_special_tokens_mask=True,
            )

        tokenized_datasets = raw_datasets.map(
            tokenize_function,
            batched=True,
            num_proc=args.preprocessing_num_workers,
            remove_columns=[text_column_name],
            load_from_cache_file=not args.overwrite_cache,
        )
    else:
        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
        # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
        # efficient when it receives the `special_tokens_mask`.
        def tokenize_function(examples):
            """Tokenize every text of the batch without truncation."""
            return tokenizer(examples[text_column_name],
                             return_special_tokens_mask=True)

        tokenized_datasets = raw_datasets.map(
            tokenize_function,
            batched=True,
            num_proc=args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not args.overwrite_cache,
        )

        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
        # max_seq_length.
        def group_texts(examples):
            """Concatenate the batch and re-split it into fixed-size chunks."""
            # Concatenate all texts.
            concatenated_examples = {
                k: sum(examples[k], [])
                for k in examples.keys()
            }
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
            # customize this part to your needs.
            total_length = (total_length // max_seq_length) * max_seq_length
            # Split by chunks of max_len.
            result = {
                k: [
                    t[i:i + max_seq_length]
                    for i in range(0, total_length, max_seq_length)
                ]
                for k, t in concatenated_examples.items()
            }
            return result

        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
        # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
        # might be slower to preprocess.
        #
        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map

        tokenized_datasets = tokenized_datasets.map(
            group_texts,
            batched=True,
            num_proc=args.preprocessing_num_workers,
            load_from_cache_file=not args.overwrite_cache,
        )

    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets["validation"]

    # Log a few random samples from the training set (fewer than three when
    # the set is smaller, since random.sample rejects an oversized request):
    for index in random.sample(range(len(train_dataset)),
                               min(3, len(train_dataset))):
        logger.info(
            f"Sample {index} of the training set: {train_dataset[index]}.")

    # Data collator
    # This one will take care of randomly masking the tokens.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm_probability=args.mlm_probability)

    # DataLoaders creation:
    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  collate_fn=data_collator,
                                  batch_size=args.per_device_train_batch_size)
    eval_dataloader = DataLoader(eval_dataset,
                                 collate_fn=data_collator,
                                 batch_size=args.per_device_eval_batch_size)

    # Optimizer
    # Split weights in two groups, one with weight decay and the other not.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader)

    # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be
    # shorter in multiprocess)

    # Scheduler and math around the number of training steps.
    num_update_steps_per_epoch = math.ceil(
        len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    else:
        args.num_train_epochs = math.ceil(args.max_train_steps /
                                          num_update_steps_per_epoch)

    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.max_train_steps,
    )

    # Train!
    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(
        f"  Instantaneous batch size per device = {args.per_device_train_batch_size}"
    )
    logger.info(
        f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
    )
    logger.info(
        f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {args.max_train_steps}")
    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps),
                        disable=not accelerator.is_local_main_process)
    completed_steps = 0

    for epoch in range(args.num_train_epochs):
        model.train()
        last_step = len(train_dataloader) - 1
        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            loss = outputs.loss
            # Scale so that the accumulated gradient is an average.
            loss = loss / args.gradient_accumulation_steps
            accelerator.backward(loss)
            # Update once a full group of micro-batches has been accumulated
            # (or at the end of the epoch). Counting with `step + 1` keeps the
            # number of updates per epoch equal to num_update_steps_per_epoch;
            # `step % n == 0` would update right after the first micro-batch.
            if ((step + 1) % args.gradient_accumulation_steps == 0
                    or step == last_step):
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                completed_steps += 1

            if completed_steps >= args.max_train_steps:
                break

        model.eval()
        losses = []
        for batch in eval_dataloader:
            with torch.no_grad():
                outputs = model(**batch)

            loss = outputs.loss
            # Repeat the batch loss once per example so that the gathered
            # tensor can be cut down to the dataset length below.
            losses.append(
                accelerator.gather(loss.repeat(
                    args.per_device_eval_batch_size)))

        losses = torch.cat(losses)
        losses = losses[:len(eval_dataset)]
        try:
            perplexity = math.exp(torch.mean(losses))
        except OverflowError:
            # A diverged loss must not abort the run before the model is saved.
            perplexity = float("inf")

        logger.info(f"epoch {epoch}: perplexity: {perplexity}")

    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(args.output_dir,
                                        save_function=accelerator.save)
def analyze(
        raw_path='/Users/Blanca/ironhack/gitrepo/ih_datamadpt0420_final_project/data/raw/rawdata.csv',
        results_path='/Users/Blanca/ironhack/gitrepo/ih_datamadpt0420_final_project/data/results/data_sentiment.csv'
):
    """Score today's tweets with a sentiment pipeline and merge them into the results CSV.

    The raw CSV is filtered to rows whose ``date`` is later than today's
    midnight, tweets from the ``BiciMAD`` account are dropped, the text is
    cleaned, and every tweet is passed through the ``transformers``
    ``sentiment-analysis`` pipeline.  The scored rows are merged with the
    rows already stored in the results CSV (de-duplicated on ``id``, newest
    row wins) and the file is rewritten.

    Args:
        raw_path: CSV with the collected tweets.  The code reads the columns
            ``date``, ``user_name``, ``text``, ``id`` and ``Unnamed: 0``.
        results_path: CSV holding previously scored tweets; it is read and
            then overwritten with the merged result.

    Returns:
        None.  The outcome is the rewritten file at ``results_path``.
    """
    data = pd.read_csv(raw_path)
    print('...lets read the tweets saved...')

    # Parse the date column so it can be compared against a Timestamp.
    data['date'] = pd.to_datetime(data['date'])
    # Midnight of the current day; only rows later than this are kept.
    today = pd.Timestamp.today().floor('D')
    data = data[data['date'] > today]

    # Drop the index column left over from an earlier CSV export.
    data = data.drop(columns=['Unnamed: 0'])
    data = data.sort_values('user_name', ascending=False)
    data = data[data.user_name != 'BiciMAD']
    data = data.reset_index(drop=True)

    # Matches @mentions, any character that is not alphanumeric/space/tab,
    # and URLs.  A raw string keeps the backslash escapes for the regex engine.
    noise = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)")

    def clean_tweet(tweet):
        """Replace noise with spaces and collapse runs of whitespace."""
        return ' '.join(noise.sub(" ", tweet).split())

    print('...and clean them a bit...')
    data['tweets_clean'] = data['text'].apply(clean_tweet)
    print('tweets cleaned!...')

    from transformers import pipeline
    # Only the pipeline is used for scoring; the code below expects its
    # labels to be 'POSITIVE' / 'NEGATIVE'.
    classifier = pipeline('sentiment-analysis')
    print('...sentiment analysis model from transformers there...')

    # Each pipeline call returns a list whose first item holds label and score.
    data['sentiment'] = data['tweets_clean'].apply(lambda text: classifier(text))
    print('TODAYs tweets with sentiment analysis done!...')

    data['score'] = [result[0]['score'] for result in data['sentiment']]
    data['label'] = [result[0]['label'] for result in data['sentiment']]
    # Signed score: positive tweets keep their score, everything else is negated.
    data['label_coded'] = data['label'].apply(lambda x: 1
                                              if x == 'POSITIVE' else -1)
    data['score_coded'] = data['label_coded'] * data['score']

    # Compare everything as strings so old and new rows merge on equal terms.
    df_old = pd.read_csv(results_path).astype(str)
    df_str = data.astype(str)
    df = pd.merge(df_old, df_str, how='outer')
    # Discard stray header rows that ended up inside the data.
    df = df[df.date != 'date']
    df = df.drop_duplicates(subset=['id'], keep='last')

    # NOTE(review): the index is written as an unnamed first column, which
    # pandas reads back as 'Unnamed: 0' on the next run - confirm whether
    # index=False is wanted before changing the file layout.
    df.to_csv(results_path, header=True)
    print('TODAYs tweets with sentiment label and score saved!...')
Ejemplo n.º 7
0
import torch
from transformers import LineByLineTextDataset
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
from transformers import AutoTokenizer
from transformers import AutoModelForMaskedLM, AutoModelForPreTraining
from pathlib import Path
import os

import torch
print(torch.cuda.is_available())

tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext")
model = AutoModelForMaskedLM.from_pretrained(
    'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext')

model.train()

from transformers import AutoModelForMaskedLM, LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="../results_file_clean.txt",
    block_size=128,
)

from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                mlm=True,
Ejemplo n.º 8
0
if GET_PERPLEXITY == 'yes':
    # Compute per-row perplexities for the two demographic CSVs with the
    # selected (optionally debiased) language model.
    logging.info('Calculating perplexity')

    _demo_dir = data_path + demo + '/'
    race_df = pd.read_csv(_demo_dir + input_file_1)
    race_df_2 = pd.read_csv(_demo_dir + input_file_2)
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model)

    if debiasing_head:
        logging.info('Loading debiased model..')
        model = AutoModelWithLMAndDebiasHead.from_pretrained(
            pretrained_model, debiasing_head=debiasing_head)
    elif 'bert' in repr(args.model_path).lower():
        # Masked-LM head when the model path mentions "bert".
        logging.info('in bert')
        model = AutoModelForMaskedLM.from_pretrained(pretrained_model)
    else:
        # GPT-style names and unknown names both load a causal-LM head;
        # only the log line differs.
        if 'gpt' in repr(pretrained_model).lower():
            logging.info('in gpt')
        else:
            logging.info('in CLM model by default')
        model = AutoModelForCausalLM.from_pretrained(pretrained_model)

    race_1_perplexity = get_perplexity_list(race_df, model, tokenizer)
    # Elapsed time is reported in minutes since `start`.
    logging.info('Done with demo1 perplexity in {} on set'.format(
        (time.time() - start) / 60))

    race_2_perplexity = get_perplexity_list(race_df_2, model, tokenizer)
    logging.info('Done with demo2 perplexity in {} on set'.format(
        (time.time() - start) / 60))

    race_df['perplexity'] = race_1_perplexity
Ejemplo n.º 9
0
def run(n_epochs, lr, train_batch_size, val_batch_size, base_model,
        clustering_loss_weight, embedding_extractor, annealing_alphas, dataset,
        val_dataset, result_dir, early_stopping, early_stopping_tol, device,
        random_state):
    """Train the clustering model on top of a masked LM and persist the results.

    Args:
        n_epochs: number of training epochs; also used to size the LR schedule.
        lr: learning rate for the RMSprop optimizer.
        train_batch_size: batch size of the training loader.
        val_batch_size: batch size of the validation loader.
        base_model: name or path given to ``from_pretrained`` for both the
            masked LM and its tokenizer.
        clustering_loss_weight: forwarded unchanged to ``train``.
        embedding_extractor: forwarded unchanged to ``init_model``.
        annealing_alphas: forwarded unchanged to ``train``.
        dataset: path of the training CSV (columns ``texts`` and ``labels``).
        val_dataset: path of the validation CSV (same columns).
        result_dir: directory that receives the metrics CSV, the pickled
            training history and the saved model.
        early_stopping: forwarded unchanged to ``train``.
        early_stopping_tol: forwarded unchanged to ``train``.
        device: device the language model is moved to.
        random_state: seed applied to NumPy and torch (CPU and all CUDA devices).

    Returns:
        None.  Artifacts are written below ``result_dir``.
    """
    # Seed every random number generator before any data or model is touched.
    for seed_fn in (np.random.seed, torch.manual_seed,
                    torch.cuda.manual_seed_all):
        seed_fn(random_state)

    def build_loader(csv_path, batch_size):
        """Read a texts/labels CSV and wrap it in an unshuffled DataLoader."""
        frame = pd.read_csv(csv_path)
        texts = frame['texts'].to_numpy()
        labels = frame['labels'].to_numpy()
        loader = DataLoader(dataset=TextDataset(texts, labels),
                            batch_size=batch_size,
                            shuffle=False)
        return labels, loader

    train_labels, train_data_loader = build_loader(dataset, train_batch_size)
    _, val_data_loader = build_loader(val_dataset, val_batch_size)

    # Language model and tokenizer receive the same keyword arguments.
    pretrained_kwargs = dict(return_dict=True, output_hidden_states=True)
    lm_model = AutoModelForMaskedLM.from_pretrained(base_model,
                                                    **pretrained_kwargs)
    tokenizer = AutoTokenizer.from_pretrained(base_model, **pretrained_kwargs)
    lm_model.to(device)

    # One cluster per distinct training label.
    n_clusters = np.unique(train_labels).shape[0]
    model, initial_centroids, initial_embeddings = init_model(
        lm_model=lm_model,
        tokenizer=tokenizer,
        data_loader=train_data_loader,
        embedding_extractor=embedding_extractor,
        n_clusters=n_clusters,
        device=device)

    # Optimizer plus a linear schedule that warms up for half an epoch.
    steps_per_epoch = len(train_data_loader)
    opt = torch.optim.RMSprop(params=model.parameters(), lr=lr, eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(
        optimizer=opt,
        num_warmup_steps=int(steps_per_epoch * 0.5),
        num_training_steps=steps_per_epoch * n_epochs)

    hist = train(n_epochs=n_epochs,
                 model=model,
                 optimizer=opt,
                 scheduler=scheduler,
                 annealing_alphas=annealing_alphas,
                 train_data_loader=train_data_loader,
                 eval_data_loader=val_data_loader,
                 clustering_loss_weight=clustering_loss_weight,
                 early_stopping=early_stopping,
                 early_stopping_tol=early_stopping_tol,
                 verbose=True)

    # Score the trained model on the validation set.
    predicted_labels, true_labels = evaluate(model=model,
                                             eval_data_loader=val_data_loader,
                                             verbose=True)
    best_matching, accuracy = cluster_accuracy(true_labels, predicted_labels)
    run_results = {
        'best_matching': best_matching,
        'accuracy': accuracy,
        'ari': adjusted_rand_score(true_labels, predicted_labels),
        'nmi': normalized_mutual_info_score(true_labels, predicted_labels),
        # Purity is kept for comparison with the Microsoft paper.
        'purity': purity_score(y_true=true_labels, y_pred=predicted_labels),
    }

    # Metrics go to a one-row CSV.
    os.makedirs(result_dir, exist_ok=True)
    metrics_path = os.path.join(result_dir, '20_newsgroups-distilbert.csv')
    pd.DataFrame.from_records([run_results]).to_csv(metrics_path, index=False)

    # Training history (pickled) and the full model object follow.
    os.makedirs(result_dir, exist_ok=True)
    with open(os.path.join(result_dir, 'train_hist.h'), 'wb') as hist_file:
        pickle.dump(hist, file=hist_file)

    torch.save(model, os.path.join(result_dir, 'model.bin'))
import torch
import sys
from transformers import pipeline, AutoTokenizer, AutoModelForMaskedLM
import numpy as np
from tqdm import tqdm


# Load the multilingual DistilBERT masked LM together with its tokenizer.
model = AutoModelForMaskedLM.from_pretrained(
    "distilbert-base-multilingual-cased")
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert-base-multilingual-cased")

# Use the GPU when one is available, then overwrite the pretrained weights
# with the checkpoint stored in the local file 'distilbert_chat'.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.load_state_dict(torch.load('distilbert_chat', map_location=device))

fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)

# The prompt is everything passed on the command line; it is extended by a
# random number (2 to 4) of predicted tokens, one mask at a time.
sentence = ' '.join(sys.argv[1:])
extra_tokens = np.random.randint(2, 5)
for _ in range(extra_tokens):
    candidates = fill_mask(sentence + '[MASK]')
    # Append the highest-ranked candidate.
    sentence = f"{sentence} {candidates[0]['token_str']}"

print(sentence)

Ejemplo n.º 11
0
def main():
    """Train and/or evaluate a masked language model with the HF ``Trainer``.

    Arguments are parsed into ``ModelArguments``, ``DataTrainingArguments``
    and ``TrainingArguments``, either from the command line or, when the
    script receives a single ``.json`` path, from that file.  The data is
    loaded (local dataset script or raw files), tokenized (line by line or
    concatenated into fixed-size chunks), and handed to a ``Trainer`` with a
    masking data collator.

    Returns:
        dict: ``{"perplexity": ...}`` when ``--do_eval`` is set, otherwise an
        empty dict.

    Raises:
        ValueError: if the output directory already holds files while training
            without ``--overwrite_output_dir``; if neither a dataset name nor
            a train/validation file is given; or if neither a tokenizer name
            nor a model path is given.
        Exception: if the local dataset script ``data/<dataset_name>.py`` is
            missing.
    """
    # See src/transformers/training_args.py for every possible argument, or
    # pass --help to this script.  Distinct argument sets keep concerns apart.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # A single argument that is a path to a json file: parse it to get the arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"输出目录({training_args.output_dir}) 以及存在,并且不为空"
            "Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log a short summary on every process.
    logger.warning(
        f"使用的 rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu} "
        + f"是否分布式训练: {bool(training_args.local_rank != -1)}, 16-bits 半精度训练: {training_args.fp16}"
    )
    # Set the Transformers logger verbosity to info on the main process only.
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("训练/评估参数 %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: either a named dataset backed by a local loading
    # script, or your own CSV/JSON/TXT training and evaluation files.
    #
    # For CSV/JSON files, this script will use the column called 'text' or the
    # first column.
    #
    # In distributed training, load_dataset ensures only one local process
    # downloads the dataset at a time.
    if data_args.dataset_name is not None:
        # The loading script must already exist locally under data/.
        cache_script = os.path.join("data", data_args.dataset_name+".py")
        if not os.path.exists(cache_script):
            raise Exception("请检查本地是否存在相关脚本文件")
        datasets = load_dataset(path=cache_script, name=data_args.dataset_config_name, data_dir=data_args.data_dir)
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        if not data_files:
            raise ValueError("Need either a dataset name or a training/validation file.")
        # Infer the loader from whichever file was supplied (the train file
        # when present), so an evaluation-only run with just a validation
        # file does not fail on a missing train file.
        extension = next(iter(data_files.values())).split(".")[-1]
        if extension == "txt":
            extension = "text"
        datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # .from_pretrained ensures only one local process downloads model & vocab at a time.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("你正从头开始初始化一个新的config.")
    # Tokenizer setup; the name "myroberta" is special-cased to use BertTokenizer.
    if model_args.tokenizer_name:
        if model_args.tokenizer_name == "myroberta":
            tokenizer = BertTokenizer.from_pretrained(
                model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
            )
        else:
            tokenizer = AutoTokenizer.from_pretrained(
                model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
            )
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
        )
    else:
        raise ValueError(
            "您正在从头实例化一个新的tokenizer。 此脚本不支持此功能。 "
            "您可以用其它形式训练好之后,在这里使用,使用方法:  using --tokenizer_name."
        )
    # Model setup; the path "myroberta" is special-cased to use RobertaForMaskedLM.
    if model_args.model_name_or_path:
        if model_args.model_name_or_path == 'myroberta':
            model = RobertaForMaskedLM.from_pretrained(
                model_args.model_name_or_path,
                from_tf=bool(".ckpt" in model_args.model_name_or_path),
                config=config,
                cache_dir=model_args.cache_dir,
            )
        else:
            model = AutoModelForMaskedLM.from_pretrained(
                model_args.model_name_or_path,
                from_tf=bool(".ckpt" in model_args.model_name_or_path),
                config=config,
                cache_dir=model_args.cache_dir,
            )
    else:
        logger.info("从头开始训练一个模型")
        model = AutoModelForMaskedLM.from_config(config)
    # Resize the embeddings to the tokenizer size; required when training a new model from scratch.
    model.resize_token_embeddings(len(tokenizer))

    # Preprocess the datasets.
    # First we tokenize all the texts.
    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    if data_args.line_by_line:
        # Line-by-line mode: tokenize each nonempty line on its own.
        padding = "max_length" if data_args.pad_to_max_length else False

        def tokenize_function(examples):
            # Read from the detected text column (not a hard-coded "text") so
            # this agrees with `remove_columns=[text_column_name]` below.
            lines = examples[text_column_name]
            print(f"收到的数据长度: {[len(t) for t in lines]}")
            # Drop empty and whitespace-only lines.
            examples[text_column_name] = [line for line in lines if len(line) > 0 and not line.isspace()]
            tokenizer_res = tokenizer(
                examples[text_column_name],
                padding=padding,
                truncation=True,
                max_length=data_args.max_seq_length,
                # We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it
                # receives the `special_tokens_mask`.
                return_special_tokens_mask=True,
            )
            # Format the lengths directly; wrapping them in print() would
            # interpolate that call's return value (None) into the message.
            print(f"tokenizer之后的数据长度: {[len(t) for t in tokenizer_res['input_ids']]}")
            return tokenizer_res

        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=[text_column_name],
            load_from_cache_file=not data_args.overwrite_cache,
        )
    else:
        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
        # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
        # efficient when it receives the `special_tokens_mask`.
        def tokenize_function(examples):
            return tokenizer(examples[text_column_name], return_special_tokens_mask=True)
        # With batched=True, map processes 1,000 rows per call by default.
        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )

        if data_args.max_seq_length is None:
            max_seq_length = tokenizer.model_max_length
        else:
            if data_args.max_seq_length > tokenizer.model_max_length:
                logger.warning(
                    f"参数给定的 max_seq_length  ({data_args.max_seq_length}) 比模型的 ({tokenizer.model_max_length}) 最大长度长. 使用模型的最大长度 max_seq_length={tokenizer.model_max_length}."
                )
            max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

        # Main data processing function: concatenates all texts of a batch and
        # cuts the result into chunks of max_seq_length.
        def group_texts(examples):
            # Concatenate all texts.
            concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder; padding could be added instead if the model supported it.
            total_length = (total_length // max_seq_length) * max_seq_length
            # Split by chunks of max_len.
            result = {
                k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
                for k, t in concatenated_examples.items()
            }
            return result

        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
        # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
        # might be slower to preprocess.
        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
        tokenized_datasets = tokenized_datasets.map(
            group_texts,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    # Data collator
    # This one takes care of randomly masking the tokens.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        # Pass the model path to train() only when it is a local directory.
        model_path = (
            model_args.model_name_or_path
            if (model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path))
            else None
        )
        trainer.train(model_path=model_path)
        trainer.save_model()  # Saves the tokenizer too for easy upload

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** 开始评估 ***")

        eval_output = trainer.evaluate()

        # Perplexity is the exponential of the evaluation loss.
        perplexity = math.exp(eval_output["eval_loss"])
        results["perplexity"] = perplexity

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_mlm.txt")
        # Only the world-zero process writes the results file.
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    return results
Ejemplo n.º 12
0
def main(train_function):
    """Assemble a from-scratch BERT masked-LM run and launch it with one backend.

    The function parses ``--local_rank``, configures logging, loads and
    concatenates the configured datasets, builds a randomly initialised model
    with a pretrained tokenizer, tokenizes and chunks the text, creates a
    ``Trainer`` and finally dispatches to the requested runner.

    Args:
        train_function: ``"huggingface"``, ``"ray_single_node"`` or
            ``"ray_multiple_nodes"``.  Any other value builds everything but
            launches nothing.

    Returns:
        None.
    """
    # ----- local_rank (as passed by torch.distributed.launch) -----------

    cli = argparse.ArgumentParser()
    cli.add_argument("--local_rank", type=int)
    parsed_rank = cli.parse_args().local_rank
    local_rank = 0 if parsed_rank is None else parsed_rank
    on_main_process = is_main_process(local_rank)

    # ----- Logging -----------

    logger = logging.getLogger(__name__)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if on_main_process else logging.WARN)

    # Only the main process raises the Transformers logger to info.
    if on_main_process:
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()

    # ----- Configurable parameters -----------

    # One dict of load_dataset keyword arguments per dataset; the loaded
    # datasets are concatenated, so their features must align.
    # Alternatives noted by the original author (with on-disk sizes):
    #   dict(path="wikitext", name="wikitext-103-raw-v1")  # 524 MB
    #   dict(path="ptb_text_only")                         # 5.7 MB
    #   dict(path="bookcorpus")                            # 4.63 GB
    #   dict(path="wikipedia")                             # 35.38 GB
    datasets_args = [
        dict(path="wikitext", name="wikitext-2-raw-v1"),  # 12.91 MB
    ]

    # Original author's note: on a V100, bs=8 uses 11/16 of the available
    # GPU memory and bs=12 uses 15/16.
    output_dir = os.path.expanduser("~/nta/results/bert")
    training_args = TrainingArguments(
        # Logging / bookkeeping
        output_dir=output_dir,
        logging_first_step=True,
        logging_steps=10,
        eval_steps=10,
        max_steps=30,  # step budget used instead of num_train_epochs
        disable_tqdm=True,
        run_name="debug_run",  # used for wandb, not for Ray
        # Hyperparameters
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        learning_rate=1e-4,
        lr_scheduler_type="linear",
        warmup_steps=500,
        weight_decay=1e-6,
    )

    # A fresh random seed per run; it is printed below for reproducibility.
    seed = random.randint(0, 1000000)

    # Changing the tokenizer forces the dataset to be tokenized again.
    config_class = BertConfig
    tokenizer_name = "bert-base-cased"

    # ----- Seed -----------

    set_seed(seed)
    print(f"Seed to reproduce: {seed}")

    # ----- Dataset -----------

    # Load the 'train' and 'validation' split of every configured dataset,
    # then concatenate per split.
    loaded_splits = {
        split_name: [load_dataset(**spec, split=split_name)
                     for spec in datasets_args]
        for split_name in ("train", "validation")
    }
    dataset = DatasetDict()
    for split_name, parts in loaded_splits.items():
        dataset[split_name] = concatenate_datasets(parts)

    def load_and_split_dataset(dataset_args, split_percentage=5):
        """Alternative: if no validation set is available, manually split the train set."""
        holdout = DatasetDict()
        holdout["train"] = load_dataset(
            **dataset_args, split=f"train[{split_percentage}%:]"
        )
        holdout["validation"] = load_dataset(
            **dataset_args, split=f"train[:{split_percentage}%]"
        )
        return holdout

    # ----- Model and tokenizer -----------

    # The model is built from a default config, i.e. not from pretrained weights.
    config = config_class()
    model = AutoModelForMaskedLM.from_config(config)

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_fast=True)
    model.resize_token_embeddings(len(tokenizer))

    # ----- Preprocessing -----------

    # Language modelling only needs the text column: "text" when present,
    # otherwise the first column of the train split.
    column_names = dataset["train"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    # load_from_cache_file is the negation of overwrite_cache, so setting
    # overwrite_cache to True is what ignores cached results.
    overwrite_cache = False
    preprocessing_num_workers = None

    def tokenize_function(examples):
        # The special-tokens mask is requested for DataCollatorForLanguageModeling.
        return tokenizer(examples[text_column_name], return_special_tokens_mask=True)

    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=column_names,
        num_proc=preprocessing_num_workers,
        load_from_cache_file=not overwrite_cache,
    )

    # Chunk length used when regrouping the tokenized text.
    max_seq_length = tokenizer.model_max_length

    def group_texts(examples):
        """Concatenate a batch and cut it into max_seq_length-sized chunks."""
        merged = {key: sum(examples[key], []) for key in examples.keys()}
        first_key = list(examples.keys())[0]
        # The tail that does not fill a whole chunk is dropped.
        usable_length = (len(merged[first_key]) // max_seq_length) * max_seq_length
        return {
            key: [
                tokens[start : start + max_seq_length]
                for start in range(0, usable_length, max_seq_length)
            ]
            for key, tokens in merged.items()
        }

    # group_texts runs once per map batch, so a remainder is discarded for
    # every batch rather than once for the whole dataset.
    tokenized_dataset = tokenized_dataset.map(
        group_texts,
        batched=True,
        num_proc=preprocessing_num_workers,
        load_from_cache_file=not overwrite_cache,
    )

    # The collator masks tokens with probability 0.15.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm_probability=0.15
    )

    # ----- Trainer -----------

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # ----- Dispatch to the requested runner -----------

    if train_function == "huggingface":
        # Marked "Tested" by the original author.
        run_hf(trainer, logger, output_dir, save_model=True, evaluate=True)

    elif train_function in ("ray_single_node", "ray_multiple_nodes"):
        # Settings shared by both Ray runners.
        ray_common = dict(
            name="bert_test",
            config=None,
            num_samples=1,
            local_dir=os.path.expanduser("~/nta/results/experiments/transformers"),
            keep_checkpoints_num=1,
        )
        if train_function == "ray_single_node":
            # Marked "Tested" by the original author.
            # note: checkpoint arguments cannot be used with a checkpointable function
            run_ray_single_instance(
                trainer,
                logger,
                resources_per_trial={"cpu": 8},
                **ray_common,
            )
        else:
            # Marked "Untested" by the original author.
            run_ray_distributed(
                trainer,
                logger,
                queue_trials=True,
                verbose=2,
                resources_per_trial={"gpu": 4},
                **ray_common,
            )
Ejemplo n.º 13
0
def _load_targets(targets_path):
    """Read the target-word file and group word forms by lemma.

    Each non-blank line is either a comma-separated list of word forms (the
    first form is used as the lemma) or a single word, optionally followed by
    tab-separated extra columns that are ignored.

    Blank lines are skipped and repeated forms are stored only once, so the
    number of forms per lemma always matches the number of distinct forms.

    Args:
        targets_path: Path to the UTF-8 encoded targets file.

    Returns:
        A ``defaultdict(list)`` mapping each lemma to its list of word forms.
    """
    targets = defaultdict(list)
    with open(targets_path, 'r', encoding='utf-8') as f_in:
        for line in f_in:
            line = line.strip()
            if not line:
                continue
            forms = line.split(',')
            if len(forms) > 1:
                lemma = forms[0]
            else:
                # Single word, possibly followed by a tab-separated score.
                lemma = line.split('\t')[0]
                forms = [lemma]
            for form in forms:
                if form not in targets[lemma]:
                    targets[lemma].append(form)
    return targets


def _index_target_forms(targets, tokenizer):
    """Tokenize every target form and build the id <-> lemma lookup tables.

    Args:
        targets: Mapping lemma -> list of word forms.
        tokenizer: Tokenizer providing ``encode``, ``tokenize`` and
            ``unk_token_id``.

    Returns:
        A tuple ``(ids2lemma, lemma2ids, len_longest_tokenized)`` where
        ``ids2lemma`` maps a tuple of token ids to its lemma, ``lemma2ids``
        maps a lemma to the id tuples of all its usable forms, and
        ``len_longest_tokenized`` is the length of the longest id tuple.
    """
    ids2lemma = {}
    lemma2ids = defaultdict(list)
    len_longest_tokenized = 0

    for lemma, forms in targets.items():
        for form in forms:
            form_id = tokenizer.encode(form, add_special_tokens=False)

            # remove '▁' from the beginning of subtoken sequences
            # NOTE(review): id 6 is hard-coded; presumably the id of '▁' in
            # the XLM-R vocabulary -- confirm before using another tokenizer.
            if len(form_id) > 1 and form_id[0] == 6:
                form_id = form_id[1:]

            if len(form_id) == 0:
                logger.warning(
                    'Empty string? Lemma: {}\tForm:"{}"\tTokenized: "{}"'.
                    format(lemma, form, tokenizer.tokenize(form)))
                continue

            if len(form_id) == 1 and form_id[0] == tokenizer.unk_token_id:
                logger.warning('Tokenizer returns UNK for this word form. '
                               'Lemma: {}\tForm: {}\tTokenized: {}'.format(
                                   lemma, form, tokenizer.tokenize(form)))
                continue

            if len(form_id) > 1:
                logger.warning('Word form split into subtokens. '
                               'Lemma: {}\tForm: {}\tTokenized: {}'.format(
                                   lemma, form, tokenizer.tokenize(form)))

            key = tuple(form_id)
            ids2lemma[key] = lemma
            lemma2ids[lemma].append(key)
            len_longest_tokenized = max(len_longest_tokenized, len(key))

    return ids2lemma, lemma2ids, len_longest_tokenized


def _count_target_usages(sentences, tokenizer, ids2lemma, lemma2ids,
                         len_longest_tokenized):
    """Count the sentences and the occurrences of every target lemma.

    Each sentence is scanned from its end towards its start; at every
    position the longest suffix of token ids that is a known target form is
    consumed and counted, otherwise a single token is dropped.

    Returns:
        A tuple ``(n_sentences, target_counter)`` where ``target_counter``
        maps each lemma in ``lemma2ids`` to its number of occurrences.
    """
    n_sentences = 0
    target_counter = {target: 0 for target in lemma2ids}
    for sentence in sentences:
        n_sentences += 1
        sentence_token_ids = tokenizer.encode(' '.join(sentence),
                                              add_special_tokens=False)

        while sentence_token_ids:
            for length in range(len_longest_tokenized, 0, -1):
                candidate_ids = tuple(sentence_token_ids[-length:])
                if candidate_ids in ids2lemma:
                    target_counter[ids2lemma[candidate_ids]] += 1
                    sentence_token_ids = sentence_token_ids[:-length]
                    break
            else:
                # No target form ends here: move one token to the left.
                sentence_token_ids = sentence_token_ids[:-1]

    return n_sentences, target_counter


def main():
    """
    Collect XLM-R representations from corpus.

    Parses the command line, loads the target words and the masked-LM model,
    counts the usages of every target lemma in the corpus, then runs the
    model over the contexts produced by ``ContextsDataset`` and stores, for
    each usage, the hidden states averaged over all returned layers (and over
    sub-tokens when the form spans several). The per-lemma usage matrices are
    written to ``--output_path`` with ``np.savez_compressed``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--model_name_or_path',
        type=str,
        required=True,
        help='path to model directory or model name (e.g., xlm-roberta-base)')
    parser.add_argument(
        '--targets_path',
        type=str,
        required=True,
        help=
        'Path to file with target words (one word per line — possibly with tab-separated change score — '
        'or a list of comma-separated word forms.')
    parser.add_argument('--output_path',
                        type=str,
                        required=True,
                        help='Output path for extracted embeddings.')
    parser.add_argument(
        '--corpus_path',
        type=str,
        required=True,
        help='Path to corpus or corpus directory (iterates through files).')
    parser.add_argument('--context_window',
                        type=int,
                        default=512,
                        help="The length of a token's entire context window")
    parser.add_argument(
        '--batch_size',
        type=int,
        default=64,
        help='The number of sentences processed at once by the LM.')
    parser.add_argument(
        '--n_dims',
        type=int,
        default=768,
        help=
        'The dimensionality of a Transformer layer (hence the dimensionality of the output embeddings).'
    )
    parser.add_argument('--local_rank',
                        type=int,
                        default=-1,
                        help='For distributed training (default: -1).')

    args = parser.parse_args()

    # Setup logging. basicConfig only takes effect on its first call, so the
    # rank-dependent level and format must be configured here, once.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logging.info(__file__.upper())
    start_time = time.time()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        n_gpu = 1

    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s",
        args.local_rank, device, n_gpu, bool(args.local_rank != -1))

    # Set seeds across modules
    set_seed(42, n_gpu)

    # Load targets
    targets = _load_targets(args.targets_path)
    n_target_forms = sum(len(vals) for vals in targets.values())
    logger.warning(f"Target lemmas: {len(targets)}.")
    logger.warning(f"Target word forms: {n_target_forms}.")

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
    model = AutoModelForMaskedLM.from_pretrained(args.model_name_or_path,
                                                 output_hidden_states=True)

    logger.warning(f"Tokenizer's added tokens:\n{tokenizer.get_added_vocab()}")

    if args.local_rank == 0:
        # Release the other processes once the first one has the files cached
        torch.distributed.barrier()

    model.to(device)

    # Store vocabulary indices of target words
    ids2lemma, lemma2ids, len_longest_tokenized = _index_target_forms(
        targets, tokenizer)

    # multi-gpu training (should be after apex fp16 initialization)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Get sentence iterator
    sentences = PathLineSentences(args.corpus_path)

    # First pass over the corpus: count sentences and usages per lemma
    n_sentences, target_counter = _count_target_usages(
        sentences, tokenizer, ids2lemma, lemma2ids, len_longest_tokenized)

    logger.warning('Total usages: %d' % (sum(target_counter.values())))

    for lemma in target_counter:
        logger.warning(f'{lemma}: {target_counter[lemma]}')

    # Container for usages: one pre-sized (count, n_dims) matrix per lemma.
    # NOTE(review): this assumes ContextsDataset yields exactly as many usages
    # per lemma as counted above -- verify against its implementation.
    usages = {
        target: np.empty((target_count, args.n_dims))  # usage matrix
        for (target, target_count) in target_counter.items()
    }

    # Iterate over sentences and collect representations
    n_usages = 0
    curr_idx = {target: 0 for target in target_counter}

    def collate(batch):
        """Stack the per-item model inputs and gather lemmas and positions."""
        return [{
            'input_ids':
            torch.cat([item[0]['input_ids'] for item in batch], dim=0),
            'attention_mask':
            torch.cat([item[0]['attention_mask'] for item in batch], dim=0)
        }, [item[1] for item in batch], [item[2] for item in batch]]

    dataset = ContextsDataset(ids2lemma, sentences, args.context_window,
                              tokenizer, len_longest_tokenized, n_sentences)
    sampler = SequentialSampler(dataset)
    dataloader = DataLoader(dataset,
                            sampler=sampler,
                            batch_size=args.batch_size,
                            collate_fn=collate)
    iterator = tqdm(dataloader,
                    desc="Iteration",
                    disable=args.local_rank not in [-1, 0])

    model.eval()
    for batch_inputs, batch_lemmas, batch_spos in iterator:
        with torch.no_grad():
            # Only the tensors in the input dict need to live on the device;
            # lemmas and positions are plain Python lists.
            batch_inputs = {
                name: tensor.to(device)
                for name, tensor in batch_inputs.items()
            }

            outputs = model(**batch_inputs)

            hidden_states = [
                layer.detach().cpu().numpy()
                for layer in outputs.hidden_states
            ]

            # store one usage vector per (lemma, occurrence)
            for b_id, lemma in enumerate(batch_lemmas):
                layers = [
                    layer[b_id, batch_spos[b_id][0]:batch_spos[b_id][1], :]
                    for layer in hidden_states
                ]
                # average over layers, then over sub-tokens if there are several
                usage_vector = np.mean(layers, axis=0)
                if usage_vector.shape[0] > 1:
                    usage_vector = np.mean(usage_vector, axis=0)
                usages[lemma][curr_idx[lemma], :] = usage_vector

                curr_idx[lemma] += 1
                n_usages += 1

    iterator.close()
    np.savez_compressed(args.output_path, **usages)

    logger.warning('Total embeddings: %d' % (n_usages))
    logger.warning("--- %s seconds ---" % (time.time() - start_time))
    def __init__(
        self,
        vocab_path="",
        model_name="bert-base-cased",
        max_edit_dist=10,
        debug=False,
        performance=False,
    ):
        """Build the vocabulary, load the masked-LM model and register the
        spaCy extensions used by the spell checker.

        No argument is required; every parameter has a default.

        Args:
            vocab_path (str, optional): Vocabulary file path to be used by the
                                         model . Defaults to "".
            model_name (str, optional): Pretrained BERT model name. Defaults to
                                        "bert-base-cased".
            max_edit_dist (int, optional): Maximum edit distance between two
                                           words. Defaults to 10.
            debug (bool, optional): This help prints logs as the data flows
                                     through the class. Defaults to False.
            performance (bool, optional): This is used to print the time taken
                                          by individual steps in spell check.
                                          Defaults to False.

        Raises:
            TypeError: If ``vocab_path`` is not a str, or ``debug`` or
                ``performance`` is not a bool.
        """
        if not (isinstance(vocab_path, str) and isinstance(debug, bool)
                and isinstance(performance, bool)):
            raise TypeError(
                "Please check datatype provided. vocab_path should be str,"
                " debug and performance should be bool")

        # Vocabulary file shipped next to this module.
        current_path = os.path.dirname(__file__)
        default_vocab_path = os.path.join(current_path, "data", "vocab.txt")

        words = []
        if vocab_path != "":
            try:
                # User specified words
                with open(vocab_path, encoding="utf8") as f:
                    words = [line.strip() for line in f]

                # Add the tokenizer specific entries the user file is not
                # expected to contain: special tokens, [unusedXX], ##pieces
                # and single characters (numbers/punctuation).
                words.extend(["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"])
                with open(default_vocab_path, encoding="utf8") as f:
                    for line in f:
                        extra_token = line.strip()
                        if (extra_token.startswith(("[unused", "##"))
                                or len(extra_token) == 1):
                            words.append(extra_token)

                if debug:
                    debug_file_path = os.path.join(current_path, "tests",
                                                   "debugFile.txt")
                    # Written as UTF-8, matching how the vocab files are read,
                    # so non-ASCII entries do not depend on the locale.
                    with open(debug_file_path, "w+",
                              encoding="utf8") as new_file:
                        new_file.write("\n".join(words))
                    print("Final vocab at " + debug_file_path)

            except Exception as e:
                # Deliberate best effort: any problem with the custom vocab
                # falls back to the bundled one instead of failing.
                print(e)
                warnings.warn("Using default vocab")
                vocab_path = ""
                words = []

        if vocab_path == "":
            with open(default_vocab_path, encoding="utf8") as f:
                words = [line.strip() for line in f]

        self.max_edit_dist = max_edit_dist
        self.model_name = model_name
        self.vocab = Vocab(strings=words)
        logging.getLogger("transformers").setLevel(logging.ERROR)
        self.BertTokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.BertModel = AutoModelForMaskedLM.from_pretrained(self.model_name)
        self.BertModel.eval()
        self.BertModel = self.BertModel.to(device)
        self.mask = self.BertTokenizer.mask_token
        self.debug = debug
        self.performance = performance

        # The extensions are registered only when the first one is missing.
        # NOTE(review): the getters are bound methods of this instance, so a
        # later instance would presumably keep using the getters of the one
        # that registered them -- confirm this is intended.
        if not Doc.has_extension("contextual_spellCheck"):
            Doc.set_extension("contextual_spellCheck", default=True)
            Doc.set_extension("performed_spellCheck", default=False)

            Doc.set_extension("suggestions_spellCheck", default={})
            Doc.set_extension("outcome_spellCheck", default="")
            Doc.set_extension("score_spellCheck", default=None)

            Span.set_extension("get_has_spellCheck",
                               getter=self.span_require_spell_check)
            Span.set_extension("score_spellCheck",
                               getter=self.span_score_spell_check)

            Token.set_extension("get_require_spellCheck",
                                getter=self.token_require_spell_check)
            Token.set_extension(
                "get_suggestion_spellCheck",
                getter=self.token_suggestion_spell_check,
            )
            Token.set_extension("score_spellCheck",
                                getter=self.token_score_spell_check)
Ejemplo n.º 15
0
from transformers import AutoTokenizer, AutoModelForMaskedLM, BertTokenizer
import os
# Pair the bert-base-chinese tokenizer with the CLUE Chinese RoBERTa weights
# and export both into the local 'myroberta' directory.
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
model = AutoModelForMaskedLM.from_pretrained("clue/roberta_chinese_base")
for artifact in (model, tokenizer):
    artifact.save_pretrained('myroberta')

# Delete two tokenizer metadata files from the export directory
# (presumably written by tokenizer.save_pretrained above).
for leftover in ("myroberta/special_tokens_map.json",
                 "myroberta/tokenizer_config.json"):
    os.remove(leftover)
# os.system("mv deberta-base ../")
# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")
# model.save_pretrained('bert_model_uncased')
# tokenizer.save_pretrained('bert_model_uncased')
Ejemplo n.º 16
0
    # obtain model and tokenizer
    #  model_name = "bert-large-uncased-whole-word-masking"
    model_name = "bert-base-uncased"
    tokenizer = BertTokenizerFast.from_pretrained(model_name)
    phrase_tokenizer = PhraseTokenizer()

    #cwd/"saved_model"/"imdb_bert_base_uncased_finetuned_normal"
    if ds_name == "imdb":
        target_model_name = "imdb_bert_base_uncased_finetuned_training"
        target_model_path = cwd / "data" / "imdb" / "saved_model" / target_model_name
    elif ds_name == "yelp_polarity":
        target_model_name = "bert-base-uncased-yelp-polarity"
        target_model_path = f"textattack/{target_model_name}"
    target_model = BertForSequenceClassification.from_pretrained(
        str(target_model_path)).to(device)
    mlm_model = AutoModelForMaskedLM.from_pretrained(model_name).to(device)

    # turn models to eval model since only inference is needed
    target_model.eval()
    mlm_model.eval()

    # tokenize the dataset to include words and phrases
    test_ds = test_ds.map(phrase_tokenizer.tokenize)

    # create the attacker
    params = {
        'k': 15,
        'beam_width': 8,
        'conf_thres': 3.0,
        'sent_semantic_thres': 0.7,
        'change_threshold': 0.4
Ejemplo n.º 17
0
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Load the tokenizer and masked-LM weights of one checkpoint, then write the
# model (only) to a local directory.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForMaskedLM.from_pretrained(checkpoint)

export_dir = 'model/pytorch_bert_base_uncased/'
model.save_pretrained(export_dir)
Ejemplo n.º 18
0
from transformers import LongformerForMaskedLM,RobertaForMaskedLM,AutoModelForMaskedLM,AutoTokenizer
import copy
import torch

# Target number of usable positions and local attention window size.
max_pos = 4096
attention_window = 512

roberta = AutoModelForMaskedLM.from_pretrained("hfl/chinese-roberta-wwm-ext")
tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext", model_max_length=max_pos)

# extend position embedding
config = roberta.config
tokenizer.model_max_length = max_pos
tokenizer.init_kwargs['model_max_length'] = max_pos

embeddings = roberta.bert.embeddings
# Detached view: the copy below is plain initialisation, not part of autograd.
old_pos_embed = embeddings.position_embeddings.weight.detach()
current_max_pos, embed_size = old_pos_embed.shape
# NOTE(review): the two extra rows presumably mirror the RoBERTa convention of
# reserving the first two position slots -- confirm for this checkpoint.
max_pos += 2
config.max_position_embeddings = max_pos
assert max_pos > current_max_pos

new_pos_embed = old_pos_embed.new_empty(max_pos, embed_size)
# new_empty() returns uninitialised memory. The model is reached through
# `roberta.bert`, i.e. BERT-style embeddings whose position ids start at 0, so
# rows 0 and 1 are looked up too and must carry the pretrained values.
new_pos_embed[:2] = old_pos_embed[:2]
# copy position embeddings over and over to initialize the new position embeddings
k = 2
step = current_max_pos - 2
while k < max_pos:
    # The last chunk may be shorter than `step`; never leave a row unfilled.
    n_rows = min(step, max_pos - k)
    new_pos_embed[k:(k + n_rows)] = old_pos_embed[2:(2 + n_rows)]
    k += step
embeddings.position_embeddings.weight.data = new_pos_embed
embeddings.position_ids.data = torch.arange(max_pos).reshape(1, max_pos)
Ejemplo n.º 19
0
def load_finetune(epoch):
    """Load the masked-LM checkpoint saved for the given fine-tuning epoch.

    Args:
        epoch: Epoch number; formatted with ``%d`` into the directory name
            ``./save_ft_<epoch>/``.

    Returns:
        The model returned by ``AutoModelForMaskedLM.from_pretrained``.
    """
    checkpoint_dir = './save_ft_%d/' % epoch
    return AutoModelForMaskedLM.from_pretrained(checkpoint_dir)
Ejemplo n.º 20
0
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--train_data_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The input training data file (a text file).")
    parser.add_argument(
        "--logging_dir",
        type=str,
        required=True,
        help="The logs directory.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written.",
    )

    parser.add_argument(
        "--model_type",
        type=str,
        required=True,
        help="The model architecture to be trained or fine-tuned.",
    )

    # Other parameters
    parser.add_argument(
        "--eval_data_file",
        default=None,
        type=str,
        help=
        "An optional input evaluation data file to evaluate the perplexity on (a text file).",
    )
    parser.add_argument(
        "--line_by_line",
        action="store_true",
        help=
        "Whether distinct lines of text in the dataset are to be handled as distinct sequences.",
    )
    parser.add_argument(
        "--lazy_loading",
        action="store_true",
        help=
        "Whether to use lazy data loading. Is necessarily line-by-line as well.",
    )
    parser.add_argument(
        "--force_pad_token",
        action="store_true",
        help=
        "Whether to force the addition of a padding token to tokenizer to prevent errors in encoding (e.g. with GPT)",
    )
    parser.add_argument(
        "--should_continue",
        action="store_true",
        help="Whether to continue from latest checkpoint in output_dir")
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        help=
        "The model checkpoint for weights initialization. Leave None if you want to train a model from scratch.",
    )

    parser.add_argument(
        "--mlm",
        action="store_true",
        help=
        "Train with masked-language modeling loss instead of language modeling."
    )
    parser.add_argument(
        "--mlm_probability",
        type=float,
        default=0.15,
        help="Ratio of tokens to mask for masked language modeling loss")

    parser.add_argument(
        "--config_name",
        default=None,
        type=str,
        help=
        "Optional pretrained config name or path if not the same as model_name_or_path. If both are None, initialize a new config.",
    )
    parser.add_argument(
        "--tokenizer_name",
        default=None,
        type=str,
        help=
        "Optional pretrained tokenizer name or path if not the same as model_name_or_path. If both are None, initialize a new tokenizer.",
    )
    parser.add_argument(
        "--cache_dir",
        default=None,
        type=str,
        help=
        "Optional directory to store the pre-trained models downloaded from s3 (instead of the default one)",
    )

    parser.add_argument(
        "--data_cache_dir",
        default=None,
        type=str,
        help=
        "Optional directory to store the pre-trained models downloaded from s3 (instead of the default one)",
    )

    parser.add_argument(
        "--block_size",
        default=-1,
        type=int,
        help="Optional input sequence length after tokenization."
        "The training dataset will be truncated in block of this size for training."
        "Default to the model max input length for single sentence inputs (take into account special tokens).",
    )
    parser.add_argument("--do_train",
                        action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action="store_true",
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training",
        action="store_true",
        help="Run evaluation during training at each logging step.")

    parser.add_argument("--per_gpu_train_batch_size",
                        default=4,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=4,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--learning_rate",
                        default=1e-4,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.01,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-6,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=1.0,
                        type=float,
                        help="Total number of training epochs to perform.")

    parser.add_argument(
        "--num_workers",
        default=0,
        type=int,
        help=
        "multi-process data loading with the specified number of loader worker processes.."
    )

    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_portion",
                        default=0.1,
                        type=float,
                        help="Linear warmup over total * warmup_portion.")
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument("--logging_steps",
                        type=int,
                        default=1000,
                        help="Log every X updates steps.")
    parser.add_argument("--save_steps",
                        type=int,
                        default=500,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--save_total_limit",
        type=int,
        default=None,
        help=
        "Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default",
    )
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name_or_path ending and ending with step number",
    )
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Avoid using CUDA when available")
    parser.add_argument("--overwrite_output_dir",
                        action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        "--overwrite_cache",
        action="store_true",
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        "--fp16",
        action="store_true",
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument("--server_ip",
                        type=str,
                        default="",
                        help="For distant debugging.")
    parser.add_argument("--server_port",
                        type=str,
                        default="",
                        help="For distant debugging.")
    global args
    args = parser.parse_args()

    args.warmup_portion = float(args.warmup_portion)
    args.inital_epoch = 0

    if args.model_type in ["bert", "roberta", "distilbert", "camembert"
                           ] and not args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the --mlm "
            "flag (masked language modeling).")
    if args.eval_data_file is None and args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")
    if args.should_continue:
        sorted_checkpoints = _sorted_checkpoints(args)
        if len(sorted_checkpoints) == 0:
            raise ValueError(
                "Used --should_continue but no checkpoint was found in --output_dir."
            )
        else:
            args.model_name_or_path = sorted_checkpoints[-1]
            args.inital_epoch = int(args.model_name_or_path.split("-")[-1])

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir
            and not args.should_continue):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        # print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)
    logger.info("Rank %d. Word size %d", args.local_rank,
                torch.distributed.get_world_size())
    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Barrier to make sure only the first process in distributed training download model & vocab

    if args.config_name:
        config = AutoConfig.from_pretrained(args.config_name,
                                            cache_dir=args.cache_dir,
                                            padding="max_length")
    elif args.model_name_or_path:
        config = AutoConfig.from_pretrained(args.model_name_or_path,
                                            cache_dir=args.cache_dir,
                                            padding="max_length")
    else:
        # When we release a pip version exposing CONFIG_MAPPING,
        # we can do `config = CONFIG_MAPPING[args.model_type]()`.
        raise ValueError(
            "You are instantiating a new config instance from scratch. This is not supported, but you can do it from another script, save it,"
            "and load it from here, using --config_name")

    global tokenizer

    if args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name,
                                                  cache_dir=args.cache_dir)
    elif args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path,
                                                  cache_dir=args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it,"
            "and load it from here, using --tokenizer_name")

    if tokenizer.pad_token_id is None:
        if args.force_pad_token:
            # See PR 3388. Some tokenizers don't had pad tokens which causes errors at the encoding step in the collate_fn.
            # We give here the option to force the addition of a pad token. The attention mask is used to ignore this token
            # when feeding to the model.
            tokenizer.add_special_tokens({"pad_token": "<pad>"})
        else:
            logger.warn(
                "Attempting to train a model whose tokenizer has no padding token. This may result in errors in the encoding step. Set the --force_pad_token flag to fix this."
            )

    if args.block_size <= 0:
        args.block_size = tokenizer.model_max_length
        # Our input block size will be the max possible for the model
    else:
        args.block_size = min(args.block_size, tokenizer.model_max_length)

    if args.model_name_or_path:
        model = AutoModelForMaskedLM.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
            cache_dir=args.cache_dir)
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForMaskedLM.from_config(config)

    logger.info("Load model at Rank %d", args.local_rank)
    model.to(args.device)

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # End of barrier to make sure only the first process in distributed training download model & vocab

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        if args.local_rank not in [-1, 0]:
            torch.distributed.barrier(
            )  # Barrier to make sure only the first process in distributed training process the dataset, and the others will use the cache

        train_dataset = load_and_cache_examples(args,
                                                tokenizer,
                                                evaluate=False)

        logger.info("Load dataset at Rank %d.", args.local_rank)

        if args.local_rank == 0:
            torch.distributed.barrier()

        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir, exist_ok=True)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (model.module if hasattr(model, "module") else model
                         )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = AutoModelForMaskedLM.from_pretrained(args.output_dir)
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME,
                              recursive=True)))
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split(
                "-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split(
                "/")[-1] if checkpoint.find("checkpoint") != -1 else ""

            model = AutoModelForMaskedLM.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, prefix=prefix)
            result = dict(
                (k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

    return results
Ejemplo n.º 21
0
def main(cfg):
    """Train the masked-LM wrapped by ``BertLitModel`` and validate after every epoch.

    The function switches to the original working directory, makes sure the
    label-word file and the data directory exist, builds the dataset, model,
    optimizer and a linear warmup scheduler, then runs ``cfg.num_train_epochs``
    epochs of training followed by validation. When ``cfg.save_path`` is set and
    ``validation_epoch_end`` reports ``best != -1``, a checkpoint dict with the
    keys ``epoch``, ``checkpoint``, ``best_f1`` and ``optimizer`` is saved.

    Args:
        cfg: configuration object; the attributes read here are
            ``model_name_or_path``, ``data_dir``, ``train_from_saved_model``,
            ``gradient_accumulation_steps``, ``num_train_epochs``, ``log_dir``
            and ``save_path``. Presumably a Hydra config, given the use of
            ``get_original_cwd`` -- confirm against the caller.

    Returns:
        None.
    """
    cwd = get_original_cwd()
    os.chdir(cwd)
    if not os.path.exists(f"data/{cfg.model_name_or_path}.pt"):
        get_label_word(cfg)
    if not os.path.exists(cfg.data_dir):
        generate_k_shot(cfg.data_dir)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    data = REDataset(cfg)
    data_config = data.get_data_config()

    config = AutoConfig.from_pretrained(cfg.model_name_or_path)
    config.num_labels = data_config["num_labels"]

    model = AutoModelForMaskedLM.from_pretrained(cfg.model_name_or_path,
                                                 config=config)

    model.to(device)

    lit_model = BertLitModel(args=cfg, model=model, tokenizer=data.tokenizer)
    data.setup()

    # Deserialize the resume checkpoint once and reuse it for the model weights,
    # the best score and the optimizer state instead of re-reading the file for
    # each of them.
    saved_state = None
    if cfg.train_from_saved_model != '':
        saved_state = torch.load(cfg.train_from_saved_model)
        model.load_state_dict(saved_state["checkpoint"])
        print("load saved model from {}.".format(cfg.train_from_saved_model))
        lit_model.best_f1 = saved_state["best_f1"]

    optimizer = lit_model.configure_optimizers()
    if saved_state is not None:
        optimizer.load_state_dict(saved_state["optimizer"])
        print("load saved optimizer from {}.".format(
            cfg.train_from_saved_model))
        # Drop the reference so the loaded tensors can be freed before training.
        saved_state = None

    # NOTE(review): the step count is divided by gradient_accumulation_steps, but
    # the loop below steps the optimizer and scheduler on every batch -- confirm
    # whether gradient accumulation is actually intended here.
    num_training_steps = len(data.train_dataloader(
    )) // cfg.gradient_accumulation_steps * cfg.num_train_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_training_steps * 0.1,
        num_training_steps=num_training_steps)
    log_step = 100

    # `logging` is called as a function here (log dir, message, print_ flag);
    # presumably a project helper rather than the stdlib module -- verify.
    logging(cfg.log_dir, '-' * 89, print_=False)
    logging(cfg.log_dir,
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) +
            ' INFO : START TO TRAIN ',
            print_=False)
    logging(cfg.log_dir, '-' * 89, print_=False)

    for epoch in range(cfg.num_train_epochs):
        model.train()
        num_batch = len(data.train_dataloader())
        total_loss = 0
        log_loss = 0
        for index, train_batch in enumerate(tqdm(data.train_dataloader())):
            loss = lit_model.training_step(train_batch, index)
            total_loss += loss.item()
            log_loss += loss.item()
            loss.backward()

            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            # Report the running mean loss (scaled by 1000) every `log_step` batches.
            if log_step > 0 and (index + 1) % log_step == 0:
                cur_loss = log_loss / log_step
                logging(
                    cfg.log_dir,
                    '| epoch {:2d} | step {:4d} | lr {} | train loss {:5.3f}'.
                    format(epoch, (index + 1), scheduler.get_last_lr(),
                           cur_loss * 1000),
                    print_=False)
                log_loss = 0
        avrg_loss = total_loss / num_batch
        logging(
            cfg.log_dir, '| epoch {:2d} | train loss {:5.3f}'.format(
                epoch, avrg_loss * 1000))

        model.eval()
        with torch.no_grad():
            val_loss = []
            for val_index, val_batch in enumerate(tqdm(data.val_dataloader())):
                loss = lit_model.validation_step(val_batch, val_index)
                val_loss.append(loss)
            f1, best, best_f1 = lit_model.validation_epoch_end(val_loss)
            logging(cfg.log_dir, '-' * 89)
            logging(cfg.log_dir,
                    '| epoch {:2d} | dev_result: {}'.format(epoch, f1))
            logging(cfg.log_dir, '-' * 89)
            logging(cfg.log_dir, '| best_f1: {}'.format(best_f1))
            logging(cfg.log_dir, '-' * 89)
            # A checkpoint is written only when a save path is configured and
            # validation_epoch_end returned something other than -1 for `best`.
            if cfg.save_path != "" and best != -1:
                save_path = cfg.save_path
                torch.save(
                    {
                        'epoch': epoch,
                        'checkpoint': model.state_dict(),
                        'best_f1': best_f1,
                        'optimizer': optimizer.state_dict()
                    },
                    save_path,
                    _use_new_zipfile_serialization=False)
                logging(cfg.log_dir,
                        '| successfully save model at: {}'.format(save_path))
                logging(cfg.log_dir, '-' * 89)
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

# Fill-mask demo: load DistilBERT, mask one word in a sentence and print the
# sentence completed with each of the five highest-scoring candidate tokens.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')
model = AutoModelForMaskedLM.from_pretrained('distilbert-base-cased')

sequence = (
    'Distilled models are smaller than the models they mimic. Using them instead of the'
    f' large versions would help {tokenizer.mask_token} our carbon footprint.'
)
print(sequence)

inputs = tokenizer.encode(sequence, return_tensors='pt')
# Locate the mask token; the second index tensor is used as the position
# along the sequence axis when slicing the logits below.
mask_positions = torch.where(inputs == tokenizer.mask_token_id)
mask_token_index = mask_positions[1]

token_logits = model(inputs).logits
mask_token_logits = token_logits[0, mask_token_index, :]
top_5_tokens = mask_token_logits.topk(5, dim=1).indices[0].tolist()
for token in top_5_tokens:
    candidate = tokenizer.decode([token])
    print(sequence.replace(tokenizer.mask_token, candidate))
Ejemplo n.º 23
0
def main():
    """Train and evaluate a masked language model with an ``Accelerator``-driven loop.

    Steps performed, in order: parse arguments, configure logging and the seed,
    optionally set up a Hub repository clone in ``args.output_dir``, load the raw
    datasets (from the Hub or from local files), load config/tokenizer/model,
    tokenize (line by line, or concatenated and chunked to ``max_seq_length``),
    build the dataloaders, optimizer and LR scheduler, optionally resume from a
    saved accelerator state, then run the train/eval loop. After each epoch the
    evaluation perplexity is logged; at the end the model and tokenizer are saved
    to ``args.output_dir`` and the main process writes ``all_results.json``.

    Returns:
        None.

    Raises:
        ValueError: if neither ``--tokenizer_name`` nor ``--model_name_or_path``
            is given.
    """
    args = parse_args()

    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
    # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment
    accelerator = Accelerator(
        log_with="all",
        logging_dir=args.output_dir) if args.with_tracking else Accelerator()
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state)

    # Setup logging, we only want one process per machine to log things on the screen.
    # accelerator.is_local_main_process is only True for one process per machine.
    logger.setLevel(
        logging.INFO if accelerator.is_local_main_process else logging.ERROR)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Handle the repository creation
    if accelerator.is_main_process:
        if args.push_to_hub:
            if args.hub_model_id is None:
                repo_name = get_full_repo_name(Path(args.output_dir).name,
                                               token=args.hub_token)
            else:
                repo_name = args.hub_model_id
            repo = Repository(args.output_dir, clone_from=repo_name)

            # Open in append mode so an existing .gitignore is kept (a write mode
            # would truncate it), read what is already ignored, and add only the
            # checkpoint patterns that are missing.
            with open(os.path.join(args.output_dir, ".gitignore"),
                      "a+") as gitignore:
                gitignore.seek(0)
                ignored = gitignore.read()
                # Make sure new patterns start on their own line.
                if ignored and not ignored.endswith("\n"):
                    gitignore.write("\n")
                if "step_*" not in ignored:
                    gitignore.write("step_*\n")
                if "epoch_*" not in ignored:
                    gitignore.write("epoch_*\n")
        elif args.output_dir is not None:
            os.makedirs(args.output_dir, exist_ok=True)
    accelerator.wait_for_everyone()

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(args.dataset_name,
                                    args.dataset_config_name)
        if "validation" not in raw_datasets.keys():
            raw_datasets["validation"] = load_dataset(
                args.dataset_name,
                args.dataset_config_name,
                split=f"train[:{args.validation_split_percentage}%]",
            )
            raw_datasets["train"] = load_dataset(
                args.dataset_name,
                args.dataset_config_name,
                split=f"train[{args.validation_split_percentage}%:]",
            )
    else:
        data_files = {}
        if args.train_file is not None:
            data_files["train"] = args.train_file
        if args.validation_file is not None:
            data_files["validation"] = args.validation_file
        extension = args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
        raw_datasets = load_dataset(extension, data_files=data_files)
        # If no validation data is there, validation_split_percentage will be used to divide the dataset.
        if "validation" not in raw_datasets.keys():
            raw_datasets["validation"] = load_dataset(
                extension,
                data_files=data_files,
                split=f"train[:{args.validation_split_percentage}%]",
            )
            raw_datasets["train"] = load_dataset(
                extension,
                data_files=data_files,
                split=f"train[{args.validation_split_percentage}%:]",
            )

    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if args.config_name:
        config = AutoConfig.from_pretrained(args.config_name)
    elif args.model_name_or_path:
        config = AutoConfig.from_pretrained(args.model_name_or_path)
    else:
        config = CONFIG_MAPPING[args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    if args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            args.tokenizer_name, use_fast=not args.use_slow_tokenizer)
    elif args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            args.model_name_or_path, use_fast=not args.use_slow_tokenizer)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if args.model_name_or_path:
        model = AutoModelForMaskedLM.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForMaskedLM.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    column_names = raw_datasets["train"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    if args.max_seq_length is None:
        max_seq_length = tokenizer.model_max_length
        if max_seq_length > 1024:
            logger.warning(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
            )
            max_seq_length = 1024
    else:
        if args.max_seq_length > tokenizer.model_max_length:
            logger.warning(
                f"The max_seq_length passed ({args.max_seq_length}) is larger than the maximum length for the"
                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
            )
        max_seq_length = min(args.max_seq_length, tokenizer.model_max_length)

    if args.line_by_line:
        # When using line_by_line, we just tokenize each nonempty line.
        padding = "max_length" if args.pad_to_max_length else False

        def tokenize_function(examples):
            # Remove empty lines
            examples[text_column_name] = [
                line for line in examples[text_column_name]
                if len(line) > 0 and not line.isspace()
            ]
            return tokenizer(
                examples[text_column_name],
                padding=padding,
                truncation=True,
                max_length=max_seq_length,
                # We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it
                # receives the `special_tokens_mask`.
                return_special_tokens_mask=True,
            )

        with accelerator.main_process_first():
            tokenized_datasets = raw_datasets.map(
                tokenize_function,
                batched=True,
                num_proc=args.preprocessing_num_workers,
                remove_columns=[text_column_name],
                load_from_cache_file=not args.overwrite_cache,
                desc="Running tokenizer on dataset line_by_line",
            )
    else:
        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
        # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
        # efficient when it receives the `special_tokens_mask`.
        def tokenize_function(examples):
            return tokenizer(examples[text_column_name],
                             return_special_tokens_mask=True)

        with accelerator.main_process_first():
            tokenized_datasets = raw_datasets.map(
                tokenize_function,
                batched=True,
                num_proc=args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not args.overwrite_cache,
                desc="Running tokenizer on every text in dataset",
            )

        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
        # max_seq_length.
        def group_texts(examples):
            # Concatenate all texts.
            concatenated_examples = {
                k: list(chain(*examples[k]))
                for k in examples.keys()
            }
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
            # customize this part to your needs.
            if total_length >= max_seq_length:
                total_length = (total_length //
                                max_seq_length) * max_seq_length
            # Split by chunks of max_len.
            result = {
                k: [
                    t[i:i + max_seq_length]
                    for i in range(0, total_length, max_seq_length)
                ]
                for k, t in concatenated_examples.items()
            }
            return result

        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
        # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
        # might be slower to preprocess.
        #
        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map

        with accelerator.main_process_first():
            tokenized_datasets = tokenized_datasets.map(
                group_texts,
                batched=True,
                num_proc=args.preprocessing_num_workers,
                load_from_cache_file=not args.overwrite_cache,
                desc=f"Grouping texts in chunks of {max_seq_length}",
            )

    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets["validation"]

    # Conditional for small test subsets
    if len(train_dataset) > 3:
        # Log a few random samples from the training set:
        for index in random.sample(range(len(train_dataset)), 3):
            logger.info(
                f"Sample {index} of the training set: {train_dataset[index]}.")

    # Data collator
    # This one will take care of randomly masking the tokens.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm_probability=args.mlm_probability)

    # DataLoaders creation:
    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  collate_fn=data_collator,
                                  batch_size=args.per_device_train_batch_size)
    eval_dataloader = DataLoader(eval_dataset,
                                 collate_fn=data_collator,
                                 batch_size=args.per_device_eval_batch_size)

    # Optimizer
    # Split weights in two groups, one with weight decay and the other not.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    # On TPU, the tie weights in our model have been disconnected, so we need to restore the ties.
    if accelerator.distributed_type == DistributedType.TPU:
        model.tie_weights()

    # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be
    # shorter in multiprocess)
    # NOTE(review): the length is read here, before `accelerator.prepare` is called further down, which
    # does not match the note above -- confirm the step counts are right for multi-process runs.

    # Scheduler and math around the number of training steps.
    num_update_steps_per_epoch = math.ceil(
        len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    else:
        args.num_train_epochs = math.ceil(args.max_train_steps /
                                          num_update_steps_per_epoch)

    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.max_train_steps,
    )

    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler)

    # Figure out how many steps we should save the Accelerator states
    if hasattr(args.checkpointing_steps, "isdigit"):
        checkpointing_steps = args.checkpointing_steps
        if args.checkpointing_steps.isdigit():
            checkpointing_steps = int(args.checkpointing_steps)
    else:
        checkpointing_steps = None

    # We need to initialize the trackers we use, and also store our configuration
    if args.with_tracking:
        experiment_config = vars(args)
        # TensorBoard cannot log Enums, need the raw value
        experiment_config["lr_scheduler_type"] = experiment_config[
            "lr_scheduler_type"].value
        accelerator.init_trackers("mlm_no_trainer", experiment_config)

    # Train!
    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(
        f"  Instantaneous batch size per device = {args.per_device_train_batch_size}"
    )
    logger.info(
        f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
    )
    logger.info(
        f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {args.max_train_steps}")
    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps),
                        disable=not accelerator.is_local_main_process)
    completed_steps = 0
    starting_epoch = 0

    # Potentially load in the weights and states from a previous save
    if args.resume_from_checkpoint:
        # The truthiness check above already excludes None and "", so the
        # checkpoint path is always the one given on the command line.
        accelerator.print(
            f"Resumed from checkpoint: {args.resume_from_checkpoint}")
        accelerator.load_state(args.resume_from_checkpoint)
        path = os.path.basename(args.resume_from_checkpoint)
        # Extract `epoch_{i}` or `step_{i}`
        training_difference = os.path.splitext(path)[0]

        if "epoch" in training_difference:
            starting_epoch = int(training_difference.replace("epoch_", "")) + 1
            resume_step = None
        else:
            resume_step = int(training_difference.replace("step_", ""))
            starting_epoch = resume_step // len(train_dataloader)
            resume_step -= starting_epoch * len(train_dataloader)

    for epoch in range(starting_epoch, args.num_train_epochs):
        model.train()
        if args.with_tracking:
            total_loss = 0
        for step, batch in enumerate(train_dataloader):
            # We need to skip steps until we reach the resumed step
            if args.resume_from_checkpoint and epoch == starting_epoch:
                if resume_step is not None and step < resume_step:
                    completed_steps += 1
                    continue
            outputs = model(**batch)
            loss = outputs.loss
            # We keep track of the loss at each epoch
            if args.with_tracking:
                total_loss += loss.detach().float()
            loss = loss / args.gradient_accumulation_steps
            accelerator.backward(loss)
            if step % args.gradient_accumulation_steps == 0 or step == len(
                    train_dataloader) - 1:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                completed_steps += 1

            if isinstance(checkpointing_steps, int):
                if completed_steps % checkpointing_steps == 0:
                    output_dir = f"step_{completed_steps }"
                    if args.output_dir is not None:
                        output_dir = os.path.join(args.output_dir, output_dir)
                    accelerator.save_state(output_dir)

            if completed_steps >= args.max_train_steps:
                break

        model.eval()
        losses = []
        for step, batch in enumerate(eval_dataloader):
            with torch.no_grad():
                outputs = model(**batch)

            loss = outputs.loss
            # Repeat the batch loss once per example slot so the gathered tensor
            # can be truncated to the dataset size below.
            losses.append(
                accelerator.gather(loss.repeat(
                    args.per_device_eval_batch_size)))

        losses = torch.cat(losses)
        losses = losses[:len(eval_dataset)]
        try:
            perplexity = math.exp(torch.mean(losses))
        except OverflowError:
            perplexity = float("inf")

        logger.info(f"epoch {epoch}: perplexity: {perplexity}")

        if args.with_tracking:
            accelerator.log(
                {
                    "perplexity": perplexity,
                    "train_loss": total_loss,
                    "epoch": epoch,
                    "step": completed_steps
                }, )

        if args.push_to_hub and epoch < args.num_train_epochs - 1:
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(
                args.output_dir,
                is_main_process=accelerator.is_main_process,
                save_function=accelerator.save)
            if accelerator.is_main_process:
                tokenizer.save_pretrained(args.output_dir)
                repo.push_to_hub(
                    commit_message=f"Training in progress epoch {epoch}",
                    blocking=False,
                    auto_lfs_prune=True)

        if args.checkpointing_steps == "epoch":
            output_dir = f"epoch_{epoch}"
            if args.output_dir is not None:
                output_dir = os.path.join(args.output_dir, output_dir)
            accelerator.save_state(output_dir)

    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(
            args.output_dir,
            is_main_process=accelerator.is_main_process,
            save_function=accelerator.save)
        if accelerator.is_main_process:
            tokenizer.save_pretrained(args.output_dir)
            if args.push_to_hub:
                repo.push_to_hub(commit_message="End of training",
                                 auto_lfs_prune=True)

            # Only the main process writes the results file, so several
            # processes never write the same path at once.
            with open(os.path.join(args.output_dir, "all_results.json"),
                      "w") as f:
                json.dump({"perplexity": perplexity}, f)
Ejemplo n.º 24
0
def run(n_epochs, lr, batch_size, base_model, clustering_loss_weight,
        embedding_extractor, annealing_alphas, dataset, train_idx_file,
        result_dir, early_stopping, early_stopping_tol, device, random_state):
    """Train the clustering model on a subset of a CSV dataset and save the results.

    Args:
        n_epochs: number of training epochs; also used to size the LR schedule.
        lr: learning rate of the RMSprop optimizer.
        batch_size: batch size of the (unshuffled) data loader.
        base_model: name or path given to ``AutoModelForMaskedLM`` /
            ``AutoTokenizer``.
        clustering_loss_weight: forwarded to ``train``.
        embedding_extractor: forwarded to ``init_model``.
        annealing_alphas: forwarded to ``train``.
        dataset: path of a CSV file with ``texts`` and ``labels`` columns.
        train_idx_file: text file with one integer row position per line;
            only these rows of the CSV are used. Blank lines are ignored.
        result_dir: directory that receives ``train_hist.h`` and ``model.bin``.
        early_stopping: forwarded to ``train``.
        early_stopping_tol: forwarded to ``train``.
        device: device the language model is moved to.
        random_state: seed for NumPy and PyTorch (CPU and all CUDA devices).
    """
    # Set random states
    np.random.seed(random_state)
    torch.manual_seed(random_state)
    torch.cuda.manual_seed_all(random_state)

    # load data
    df = pd.read_csv(dataset)

    # Skip blank lines (e.g. a trailing newline) instead of failing on int('').
    with open(train_idx_file, 'r') as f:
        train_idx = np.array([int(line) for line in f if line.strip()])

    # Positional selection of the training rows.
    df = df.iloc[train_idx].copy()

    texts = df['texts'].to_numpy()
    labels = df['labels'].to_numpy()

    data = TextDataset(texts, labels)
    data_loader = DataLoader(dataset=data,
                             batch_size=batch_size,
                             shuffle=False)

    # init lm model & tokenizer
    lm_model = AutoModelForMaskedLM.from_pretrained(base_model,
                                                    return_dict=True,
                                                    output_hidden_states=True)
    # NOTE(review): return_dict / output_hidden_states look like model options
    # copied over from the call above; presumably the tokenizer ignores them —
    # confirm before removing.
    tokenizer = AutoTokenizer.from_pretrained(base_model,
                                              return_dict=True,
                                              output_hidden_states=True)

    lm_model.to(device)

    # init clustering model; one cluster per distinct label value
    model, initial_centroids, initial_embeddings = init_model(
        lm_model=lm_model,
        tokenizer=tokenizer,
        data_loader=data_loader,
        embedding_extractor=embedding_extractor,
        n_clusters=np.unique(labels).shape[0],
        device=device)

    # init optimizer & scheduler
    opt = torch.optim.RMSprop(
        params=model.parameters(),
        lr=lr,  # 2e-5, 5e-7,
        eps=1e-8)

    total_steps = len(data_loader) * n_epochs

    # Linear schedule; warm-up lasts half an epoch worth of batches.
    scheduler = get_linear_schedule_with_warmup(optimizer=opt,
                                                num_warmup_steps=int(
                                                    len(data_loader) * 0.5),
                                                num_training_steps=total_steps)

    # train the model
    hist = train(n_epochs=n_epochs,
                 model=model,
                 optimizer=opt,
                 scheduler=scheduler,
                 annealing_alphas=annealing_alphas,
                 train_data_loader=data_loader,
                 clustering_loss_weight=clustering_loss_weight,
                 early_stopping=early_stopping,
                 early_stopping_tol=early_stopping_tol,
                 verbose=True)

    # save results & model
    # exist_ok=True: an already existing result directory must not throw away
    # a finished training run with a FileExistsError.
    os.makedirs(result_dir, exist_ok=True)
    with open(os.path.join(result_dir, 'train_hist.h'), 'wb') as f:
        pickle.dump(hist, file=f)

    torch.save(model, os.path.join(result_dir, 'model.bin'))
Ejemplo n.º 25
0
        "Usage: python train_mlm.py model_name data/train_sentences.txt [data/dev_sentences.txt]"
    )
    exit()

model_name = sys.argv[1]
per_device_train_batch_size = 64

save_steps = 1000  #Save model every 1k steps
num_train_epochs = 3  #Number of epochs
use_fp16 = False  #Set to True, if your GPU supports FP16 operations
max_length = 100  #Max length for a text input
do_whole_word_mask = True  #If set to true, whole words are masked
mlm_prob = 0.15  #Probability that a word is replaced by a [MASK] token

# Load the model
model = AutoModelForMaskedLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

output_dir = "output/{}-{}".format(
    model_name.replace("/", "_"),
    datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
print("Save checkpoints to:", output_dir)

##### Load our training datasets

train_sentences = []
train_path = sys.argv[2]
with gzip.open(train_path, 'rt',
               encoding='utf8') if train_path.endswith('.gz') else open(
                   train_path, 'r', encoding='utf8') as fIn:
    for line in fIn:
Ejemplo n.º 26
0
def main():
    """Train and/or evaluate a masked language model with the ``Trainer`` API.

    Arguments are parsed into ``ModelArguments``, ``DataTrainingArguments`` and
    ``TrainingArguments`` either from a single ``.json`` file given as the only
    command-line argument or from the regular command line. The function then
    looks for a resumable checkpoint in the output directory, builds the
    tokenizer, config, model, data collator and datasets, and runs training
    (``--do_train``) and evaluation (``--do_eval``).

    Raises:
        ValueError: if training is requested, the output directory exists and
            is not empty, holds no checkpoint, and ``--overwrite_output_dir``
            was not passed.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)

    # Log on each process the small summary:
    # (the two halves are joined with ", " so the n_gpu value does not run into the next field)
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config_kwargs = {
        "cache_dir": model_args.cache_dir,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }

    # tokenizer = GPT2TokenizerFast(
    #     os.path.join('../tokenizer', 'vocab.json'),
    #     os.path.join('../tokenizer', 'merges.txt'),
    #     bos_token='<s>',
    #     eos_token='</s>',
    #     sep_token='</s>',
    #     cls_token='<s>',
    #     unk_token='<unk>',
    #     pad_token='<pad>',
    #     mask_token='<mask>',
    # )
    # The tokenizer is always the stock `roberta-base` one, independent of the model arguments.
    tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
    tokenizer.add_special_tokens({
        'bos_token': '<s>',
        'eos_token': '</s>',
        'sep_token': '</s>',
        'cls_token': '<s>',
        'unk_token': '<unk>',
        'pad_token': '<pad>',
        'mask_token': '<mask>',
    })
    # tokenizer.convert_tokens_to_ids(s)
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if model_args.model_name_or_path:
        model = AutoModelForMaskedLM.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        # Without a model name/path there is nothing to load; build the model
        # from the config chosen above instead of failing on `".ckpt" in None`.
        logger.info("Training new model from scratch")
        model = AutoModelForMaskedLM.from_config(config)
    # Keep the embedding matrix in sync with the tokenizer vocabulary size.
    model.resize_token_embeddings(len(tokenizer))

    # Data collator
    # This one will take care of randomly masking the tokens.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)

    # NOTE(review): `Dataset` is presumably a project-specific class taking
    # `paths` and `tokenizer`; training passes a list of paths while evaluation
    # passes a single string — confirm both forms are accepted.
    train_dataset = Dataset(paths=['../bin_data/{}_text_document'.format(i) for i in range(4)],
                            tokenizer=tokenizer)
    eval_dataset = Dataset(paths='../bin_data/4_text_document', tokenizer=tokenizer)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        # Resume priority: checkpoint found in output_dir, then a local model directory, else start fresh.
        if last_checkpoint is not None:
            checkpoint = last_checkpoint
        elif model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path):
            checkpoint = model_args.model_name_or_path
        else:
            checkpoint = None
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload
        metrics = train_result.metrics

        max_train_samples = (
            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
        )
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        metrics = trainer.evaluate()

        max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset)
        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))
        # math.exp raises OverflowError for very large losses; report infinity instead of crashing.
        try:
            perplexity = math.exp(metrics["eval_loss"])
        except OverflowError:
            perplexity = float("inf")
        metrics["perplexity"] = perplexity

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)
Ejemplo n.º 27
0
def loadmodel(epoch):
    """Load the masked-LM checkpoint stored in ``./save<epoch>/`` and return it."""
    checkpoint_dir = './save%d/' % epoch
    return AutoModelForMaskedLM.from_pretrained(checkpoint_dir)
Ejemplo n.º 28
0
def stage1pretrain():
    """Run masked-language-model pre-training driven by ``PretrainConfig``.

    The function validates the configured data files, loads them with
    ``load_dataset``, builds config/tokenizer/model, tokenizes the text and
    regroups it into chunks of ``max_seq_length`` tokens, trains with
    Accelerate (AdamW plus the configured LR scheduler, with gradient
    accumulation), logs the validation perplexity after every epoch and, when
    ``config.output_dir`` is set, saves the final model there.

    Raises:
        AssertionError: if a configured train/validation file is not a csv,
            json or txt file.
        ValueError: if neither ``tokenizer_name`` nor ``model_name_or_path``
            is configured.
    """
    logger.info("stage1pretrain starts")
    config = PretrainConfig()
    if config.train_file is not None:
        extension = config.train_file.split(".")[-1]
        assert extension in [
            "csv", "json", "txt"
        ], "`train_file` should be a csv, json or txt file."
    if config.validation_file is not None:
        extension = config.validation_file.split(".")[-1]
        assert extension in [
            "csv", "json", "txt"
        ], "`validation_file` should be a csv, json or txt file."
    if config.output_dir is not None:
        os.makedirs(config.output_dir, exist_ok=True)

    # NOTE(review): presumably writes text-only versions of the CommonLit
    # train/test CSVs that the configured data files point to — confirm.
    saveDataWithTextsOnly("../../data/commonlitreadability/train.csv",
                          "../../data/commonlitreadability/test.csv")

    accelerator = Accelerator()
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state)
    # Only the local main process logs at INFO level; the others only report errors.
    logger.setLevel(
        logging.INFO if accelerator.is_local_main_process else logging.ERROR)

    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()
    if config.seed is not None:
        set_seed(config.seed)

    data_files = {}
    if config.train_file is not None:
        data_files["train"] = config.train_file
    if config.validation_file is not None:
        data_files["validation"] = config.validation_file
    extension = config.train_file.split(".")[-1]
    # Plain text files are loaded with the "text" dataset builder.
    if extension == "txt":
        extension = "text"
    raw_datasets = load_dataset(extension, data_files=data_files)

    if config.config_name:
        modelconfig = AutoConfig.from_pretrained(config.config_name)
    elif config.model_name_or_path:
        modelconfig = AutoConfig.from_pretrained(config.model_name_or_path)
    else:
        modelconfig = CONFIG_MAPPING[config.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    if config.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            config.tokenizer_name, use_fast=not config.use_slow_tokenizer)
    elif config.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            config.model_name_or_path, use_fast=not config.use_slow_tokenizer)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if config.model_name_or_path:
        model = AutoModelForMaskedLM.from_pretrained(
            config.model_name_or_path,
            from_tf=bool(".ckpt" in config.model_name_or_path),
            config=modelconfig,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForMaskedLM.from_config(modelconfig)

    # Keep the embedding matrix in sync with the tokenizer vocabulary size.
    model.resize_token_embeddings(len(tokenizer))

    column_names = raw_datasets["train"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    if config.max_seq_length is None:
        max_seq_length = tokenizer.model_max_length
        if max_seq_length > 1024:
            logger.warning(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
            )
            max_seq_length = 1024
    else:
        if config.max_seq_length > tokenizer.model_max_length:
            logger.warning(
                f"The max_seq_length passed ({config.max_seq_length}) is larger than the maximum length for the "
                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
            )
        max_seq_length = min(config.max_seq_length, tokenizer.model_max_length)

    def tokenize_function(examples):
        """Tokenize the text column and also return the special-tokens mask."""
        return tokenizer(examples[text_column_name],
                         return_special_tokens_mask=True)

    def group_texts(examples):
        """Concatenate a batch of tokenized texts and cut it into max_seq_length chunks."""
        concatenated_examples = {
            k: sum(examples[k], [])
            for k in examples.keys()
        }
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # Drop the remainder that does not fill a whole chunk.
        total_length = (total_length // max_seq_length) * max_seq_length
        result = {
            k: [
                t[i:i + max_seq_length]
                for i in range(0, total_length, max_seq_length)
            ]
            for k, t in concatenated_examples.items()
        }
        return result

    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        batched=True,
        num_proc=config.preprocessing_num_workers,
        remove_columns=column_names,
        load_from_cache_file=not config.overwrite_cache,
    )

    tokenized_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
        num_proc=config.preprocessing_num_workers,
        load_from_cache_file=not config.overwrite_cache,
    )
    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets["validation"]

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm_probability=config.mlm_probability)
    train_dataloader = DataLoader(
        train_dataset,
        shuffle=True,
        collate_fn=data_collator,
        batch_size=config.per_device_train_batch_size)
    eval_dataloader = DataLoader(eval_dataset,
                                 collate_fn=data_collator,
                                 batch_size=config.per_device_eval_batch_size)

    # Two parameter groups: weight decay is applied to everything except
    # biases and LayerNorm weights.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            config.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=config.learning_rate)

    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader)

    # Derive whichever of max_train_steps / num_train_epochs was not configured.
    num_update_steps_per_epoch = math.ceil(
        len(train_dataloader) / config.gradient_accumulation_steps)
    if config.max_train_steps is None:
        config.max_train_steps = config.num_train_epochs * num_update_steps_per_epoch
    else:
        config.num_train_epochs = math.ceil(config.max_train_steps /
                                            num_update_steps_per_epoch)

    lr_scheduler = get_scheduler(
        name=config.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=config.num_warmup_steps,
        num_training_steps=config.max_train_steps,
    )

    total_batch_size = config.per_device_train_batch_size * accelerator.num_processes * config.gradient_accumulation_steps

    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num Epochs = {config.num_train_epochs}")
    logger.info(
        f"  Instantaneous batch size per device = {config.per_device_train_batch_size}"
    )
    logger.info(
        f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
    )
    logger.info(
        f"  Gradient Accumulation steps = {config.gradient_accumulation_steps}"
    )
    logger.info(f"  Total optimization steps = {config.max_train_steps}")
    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(config.max_train_steps),
                        disable=not accelerator.is_local_main_process)
    completed_steps = 0

    for epoch in range(config.num_train_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            loss = outputs.loss
            # Scale so the accumulated gradient is the mean over the window.
            loss = loss / config.gradient_accumulation_steps
            accelerator.backward(loss)
            # Update once per full accumulation window (counting batches from
            # 1) and on the last batch of the epoch. Testing `step % k == 0`
            # would already update after the very first batch and perform
            # more updates per epoch than num_update_steps_per_epoch.
            if (step + 1) % config.gradient_accumulation_steps == 0 or step == len(
                    train_dataloader) - 1:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                completed_steps += 1

            if completed_steps >= config.max_train_steps:
                break

        model.eval()
        losses = []
        for step, batch in enumerate(eval_dataloader):
            with torch.no_grad():
                outputs = model(**batch)

            loss = outputs.loss
            # Repeat the batch loss once per sample slot so it can be gathered
            # across processes and truncated to the dataset size below.
            losses.append(
                accelerator.gather(
                    loss.repeat(config.per_device_eval_batch_size)))

        losses = torch.cat(losses)
        losses = losses[:len(eval_dataset)]
        # math.exp raises OverflowError for very large losses; report infinity instead of crashing.
        try:
            perplexity = math.exp(torch.mean(losses))
        except OverflowError:
            perplexity = float("inf")

        logger.info(f"epoch {epoch}: perplexity: {perplexity}")

    if config.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(config.output_dir,
                                        save_function=accelerator.save)
Ejemplo n.º 29
0
def main():
    """Train and/or evaluate a masked language model with the ``Trainer`` API.

    Arguments are parsed into ``ModelArguments``, ``DataTrainingArguments`` and
    ``TrainingArguments`` either from a single ``.json`` file given as the only
    command-line argument or from the regular command line. The function then
    looks for a resumable checkpoint, loads the datasets (from the hub or from
    local files), builds config/tokenizer/model, tokenizes the data (line by
    line, or concatenated and cut into ``max_seq_length`` chunks) and runs
    training (``--do_train``) and evaluation (``--do_eval``).

    Raises:
        ValueError: if the output directory is non-empty without a checkpoint
            and ``--overwrite_output_dir`` is missing, if no tokenizer source
            is given, or if the split required by ``--do_train`` /
            ``--do_eval`` is not available.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(
            training_args.output_dir
    ) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(
                training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank
                                                    ) else logging.WARN)

    # Log on each process the small summary:
    # (the two halves are joined with ", " so the n_gpu value does not run into the next field)
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column. You can easily tweak this
    # behavior (see below)
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name,
                                data_args.dataset_config_name)
        if "validation" not in datasets.keys():
            # No validation split: carve the first N% of train out as validation
            # and keep the rest for training.
            datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[:{data_args.validation_split_percentage}%]",
            )
            datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[{data_args.validation_split_percentage}%:]",
            )
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        # Take the extension from the train file when there is one, otherwise
        # from the validation file, so evaluation-only runs work as well.
        extension_source = (data_args.train_file
                            if data_args.train_file is not None else
                            data_args.validation_file)
        extension = extension_source.split(".")[-1]
        if extension == "txt":
            extension = "text"
        datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config_kwargs = {
        "cache_dir": model_args.cache_dir,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            **config_kwargs)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                            **config_kwargs)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    tokenizer_kwargs = {
        "cache_dir": model_args.cache_dir,
        "use_fast": model_args.use_fast_tokenizer,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name,
                                                  **tokenizer_kwargs)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, **tokenizer_kwargs)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if model_args.model_name_or_path:
        model = AutoModelForMaskedLM.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForMaskedLM.from_config(config)

    # Keep the embedding matrix in sync with the tokenizer vocabulary size.
    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    if data_args.max_seq_length is None:
        max_seq_length = tokenizer.model_max_length
        if max_seq_length > 1024:
            logger.warning(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
            )
            max_seq_length = 1024
    else:
        if data_args.max_seq_length > tokenizer.model_max_length:
            logger.warning(
                f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
            )
        max_seq_length = min(data_args.max_seq_length,
                             tokenizer.model_max_length)

    if data_args.line_by_line:
        # When using line_by_line, we just tokenize each nonempty line.
        padding = "max_length" if data_args.pad_to_max_length else False

        def tokenize_function(examples):
            """Tokenize every non-empty line of the text column."""
            # Remove empty lines. The column is addressed through
            # text_column_name (not a hard-coded "text") so datasets whose
            # text column has another name work too.
            examples[text_column_name] = [
                line for line in examples[text_column_name]
                if len(line) > 0 and not line.isspace()
            ]
            return tokenizer(
                examples[text_column_name],
                padding=padding,
                truncation=True,
                max_length=max_seq_length,
                # We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it
                # receives the `special_tokens_mask`.
                return_special_tokens_mask=True,
            )

        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=[text_column_name],
            load_from_cache_file=not data_args.overwrite_cache,
        )
    else:
        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
        # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
        # efficient when it receives the `special_tokens_mask`.
        def tokenize_function(examples):
            """Tokenize the text column and also return the special-tokens mask."""
            return tokenizer(examples[text_column_name],
                             return_special_tokens_mask=True)

        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )

        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
        # max_seq_length.
        def group_texts(examples):
            """Concatenate a batch of tokenized texts and cut it into max_seq_length chunks."""
            # Concatenate all texts.
            concatenated_examples = {
                k: sum(examples[k], [])
                for k in examples.keys()
            }
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
            # customize this part to your needs.
            total_length = (total_length // max_seq_length) * max_seq_length
            # Split by chunks of max_len.
            result = {
                k: [
                    t[i:i + max_seq_length]
                    for i in range(0, total_length, max_seq_length)
                ]
                for k, t in concatenated_examples.items()
            }
            return result

        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
        # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
        # might be slower to preprocess.
        #
        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map

        tokenized_datasets = tokenized_datasets.map(
            group_texts,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    if training_args.do_train:
        if "train" not in tokenized_datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = tokenized_datasets["train"]
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(
                range(data_args.max_train_samples))

    if training_args.do_eval:
        if "validation" not in tokenized_datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = tokenized_datasets["validation"]
        if data_args.max_val_samples is not None:
            eval_dataset = eval_dataset.select(range(
                data_args.max_val_samples))

    # Data collator
    # This one will take care of randomly masking the tokens.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        # Resume priority: checkpoint found in output_dir, then a local model directory, else start fresh.
        if last_checkpoint is not None:
            checkpoint = last_checkpoint
        elif model_args.model_name_or_path is not None and os.path.isdir(
                model_args.model_name_or_path):
            checkpoint = model_args.model_name_or_path
        else:
            checkpoint = None
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload
        metrics = train_result.metrics

        max_train_samples = (data_args.max_train_samples
                             if data_args.max_train_samples is not None else
                             len(train_dataset))
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        metrics = trainer.evaluate()

        max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(
            eval_dataset)
        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))
        # math.exp raises OverflowError for very large losses; report infinity instead of crashing.
        try:
            perplexity = math.exp(metrics["eval_loss"])
        except OverflowError:
            perplexity = float("inf")
        metrics["perplexity"] = perplexity

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)
Ejemplo n.º 30
0
def fine_tune(cfg: DictConfig) -> float:
    """Fine-tune a masked-language model described by ``cfg`` and evaluate it.

    Steps, in order: call ``init_wandb(cfg)``, load the tokenizer, config and
    ``AutoModelForMaskedLM`` weights, build the train/test datasets with
    ``getDataset``, train with a ``Trainer``, save the model, the training
    metrics and the trainer state, run one final evaluation (adding a
    ``"perplexity"`` entry to its metrics), and finally save ``trainer.model``
    under ``<output_dir>/best``.

    Args:
        cfg: Run configuration. This function reads ``cfg["module"]["arch"]``,
            ``cfg.model.arch``, ``cfg.model.num_labels``, ``cfg.seed`` and
            ``cfg.train`` (keys ``output_dir``, ``epoch``,
            ``train_batch_size``, ``eval_batch_size``, ``acc_batch``,
            ``label_smooth``, ``lr``, ``warmup`` and ``wd``). It is also
            forwarded unchanged to ``init_wandb`` and ``getDataset``.

    Raises:
        IndexError: If the W&B run name contains no ``"-"``, because the
            second element of ``rsplit("-", 1)`` is indexed unconditionally.

    NOTE(review): the signature is annotated ``-> float`` but no ``return``
    statement is visible in this part of the function -- confirm which value
    callers expect.
    """
    # Presumably starts the W&B run; ``wandb.run`` is read further down, so it
    # has to exist by then -- TODO confirm against init_wandb.
    init_wandb(cfg)

    # NOTE(review): the tokenizer is loaded from cfg["module"]["arch"] while the
    # config and weights below use cfg.model.arch -- confirm both keys exist and
    # name the same checkpoint.
    tokenizer = AutoTokenizer.from_pretrained(cfg["module"]["arch"])
    # Collator for masked-language-modelling batches; the masking probability
    # is fixed at 0.15 here rather than read from cfg.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm_probability=0.15,
    )
    config = AutoConfig.from_pretrained(cfg.model.arch,
                                        num_labels=cfg.model.num_labels)

    model = AutoModelForMaskedLM.from_pretrained(cfg.model.arch, config=config)
    # Keep the embedding table the same size as the tokenizer's vocabulary.
    model.resize_token_embeddings(len(tokenizer))

    train_ds, test_ds = getDataset(cfg, tokenizer)

    # The text after the last "-" of the W&B run name becomes the per-run
    # sub-directory name.
    # NOTE(review): ``id`` shadows the builtin of the same name for the rest of
    # this function.
    id = wandb.run.name.rsplit("-", 1)[1]
    trainConfig = cfg.train
    output_dir = os.path.join(trainConfig["output_dir"], id)
    print("module output dir = ", output_dir)
    train_args = TrainingArguments(
        # module pred/ckpt
        output_dir=output_dir,
        # tensorboard logs
        logging_dir="./logs",
        num_train_epochs=trainConfig["epoch"],
        per_device_train_batch_size=trainConfig["train_batch_size"],
        per_device_eval_batch_size=trainConfig["eval_batch_size"],
        # x (logging / eval /save) every acc * x_steps
        gradient_accumulation_steps=trainConfig["acc_batch"],
        evaluation_strategy=IntervalStrategy.EPOCH,
        label_smoothing_factor=trainConfig["label_smooth"],
        # AdamW
        learning_rate=trainConfig["lr"],
        warmup_steps=trainConfig["warmup"],
        # apply to all layers but bias / LayerNorm
        weight_decay=trainConfig["wd"],
        # save_total_limit=2,
        # if True, ignore param save_strategy / save_steps / save_total_limit
        load_best_model_at_end=True,
        # report_to=["none"],
        report_to=["wandb"],
        seed=cfg.seed,
        # logging_strategy=IntervalStrategy.STEPS,
        # metric_for_best_model=trainConfig["metric"]
    )

    # The test split doubles as the evaluation set, both for the per-epoch
    # evaluation configured above and for the final trainer.evaluate() call.
    trainer = Trainer(
        model,
        args=train_args,
        train_dataset=train_ds,
        eval_dataset=test_ds,
        tokenizer=tokenizer,
        data_collator=data_collator,
        # callbacks=[
        #     EarlyStoppingCallback(early_stopping_patience=trainConfig["early_stopping_patience"]),
        # ],
        # compute_metrics=compute_metrics,
    )

    # logging_dir above is relative ("./logs"), so print the working directory
    # it is relative to.
    print("logs in dir", os.getcwd())
    print("gpu count = ", trainer.args.n_gpu, "is_fp16 =", trainer.args.fp16)

    # Train, then persist the model, the training metrics and the trainer state.
    train_result = trainer.train()
    trainer.save_model()
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()

    # Final evaluation; ``metrics`` is rebound to the evaluation results.
    metrics = trainer.evaluate()

    # Perplexity is exp(eval_loss); math.exp raises OverflowError for very
    # large losses, in which case infinity is reported instead.
    try:
        perplexity = math.exp(metrics["eval_loss"])
    except OverflowError:
        perplexity = float("inf")
    metrics["perplexity"] = perplexity
    trainer.log_metrics("eval", metrics)
    trainer.save_metrics("eval", metrics)

    # Save trainer.model under <output_dir>/best. With
    # load_best_model_at_end=True this is presumably the best checkpoint
    # rather than the last one -- TODO confirm.
    trainer.model.save_pretrained(os.path.join(output_dir, "best"))