Example no. 1
    def __init__(self,
                 n_labels,
                 hidden_size,
                 dropout=0.2,
                 label_ignore_idx=0,
                 max_seq_length=128,
                 batch_size=32,
                 head_init_range=0.04,
                 device='cuda',
                 vocab_size=320):
        super().__init__()
        self.n_labels = n_labels

        self.linear_1 = nn.Linear(hidden_size, hidden_size)
        self.classification_head = nn.Linear(hidden_size, n_labels)
        self.label_ignore_idx = label_ignore_idx
        self.tokenizer = ReformerTokenizer.from_pretrained(
            'google/reformer-crime-and-punishment')
        config = ReformerConfig(
            axial_pos_shape=[batch_size,
                             int(max_seq_length / batch_size)])
        self.model = ReformerModel(config)
        self.dropout = nn.Dropout(dropout)

        self.device = device

        # initialize the classification head weights
        self.classification_head.weight.data.normal_(mean=0.0,
                                                     std=head_init_range)
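The constructor above is only half of the module; the excerpt does not include a forward pass. Below is a minimal sketch of what one might look like for token classification, assuming the class above, that torch and torch.nn (as nn) are imported in the original file, and that the hidden_size given to the constructor matches the feature dimension of the Reformer output; the method name, signature, and loss handling are illustrative assumptions, not taken from the original.

    def forward(self, input_ids, attention_mask=None, labels=None):
        # sketch only: run the Reformer backbone; the first output is the sequence of hidden states
        hidden_states = self.model(input_ids, attention_mask=attention_mask)[0]
        hidden_states = self.dropout(hidden_states)
        hidden_states = torch.tanh(self.linear_1(hidden_states))
        logits = self.classification_head(hidden_states)

        if labels is not None:
            # skip positions marked with label_ignore_idx when computing the token-level loss
            loss_fct = nn.CrossEntropyLoss(ignore_index=self.label_ignore_idx)
            loss = loss_fct(logits.view(-1, self.n_labels), labels.view(-1))
            return loss, logits
        return logits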
def prepare_dataset(max_length):
    # get pretrained tokenizer
    tokenizer = ReformerTokenizer.from_pretrained(
        "patrickvonplaten/reformer-crime-and-punish")

    # define our map function to reduce the dataset to one sample
    def flatten_and_tokenize(batch):
        all_input_text = ["".join(batch["line"])]
        input_ids_dict = tokenizer.batch_encode_plus(
            all_input_text,
            pad_to_max_length=True,
            max_length=max_length,
        )

        # duplicate the data 8 times so the dataset contains 8 examples
        for key in input_ids_dict.keys():
            input_ids_dict[key] = [8 * [x] for x in input_ids_dict[key]][0]
        return input_ids_dict

    # load the dataset
    dataset = nlp.load("crime_and_punish", split="train")

    # reduce the dataset
    dataset = dataset.map(flatten_and_tokenize,
                          batched=True,
                          batch_size=-1,
                          remove_columns=["line"])

    # prepare dataset to be in torch format
    dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

    return dataset
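A short usage sketch for the function above; the max_length value and batch size are illustrative only, and the nlp and ReformerTokenizer imports from the original script are assumed to be present.

import torch

# build the small 8-example dataset and iterate over it in torch format
dataset = prepare_dataset(max_length=512)
loader = torch.utils.data.DataLoader(dataset, batch_size=4)

for batch in loader:
    # each batch holds the padded token ids and the matching attention mask
    print(batch["input_ids"].shape, batch["attention_mask"].shape)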
Example no. 3
    def test_pretrained_generate_use_cache_equality(self):
        model = ReformerModelWithLMHead.from_pretrained("google/reformer-crime-and-punishment").to(torch_device)
        tokenizer = ReformerTokenizer.from_pretrained("google/reformer-crime-and-punishment")
        model.eval()
        input_ids = tokenizer.encode("A few months later", return_tensors="pt").to(torch_device)
        output_ids_with_cache = model.generate(input_ids, max_length=130, num_hashes=8, use_cache=True)
        output_ids_without_cache = model.generate(input_ids, max_length=130, num_hashes=8, use_cache=False)

        output_with_cache = tokenizer.decode(output_ids_with_cache[0])
        output_without_cache = tokenizer.decode(output_ids_without_cache[0])

        self.assertEqual(output_with_cache, output_without_cache)
    def test_pretrained_generate_crime_and_punish(self):
        model = ReformerModelWithLMHead.from_pretrained("google/reformer-crime-and-punishment").to(torch_device)
        tokenizer = ReformerTokenizer.from_pretrained("google/reformer-crime-and-punishment")
        model.eval()

        input_ids = tokenizer.encode("A few months later", return_tensors="pt").to(torch_device)
        output_ids = model.generate(
            input_ids, max_length=50, num_beams=4, early_stopping=True, do_sample=False, num_hashes=8
        )
        output_text = tokenizer.decode(output_ids[0])
        self.assertEqual(
            output_text,
            "A few months later state expression in his ideas, at the first entrance. He was positively for an inst",
        )
    def test_tokenization_reformer(self):
        # Given
        self.base_tokenizer = ReformerTokenizer.from_pretrained(
            'google/reformer-crime-and-punishment',
            do_lower_case=False,
            cache_dir=self.test_dir)
        self.rust_tokenizer = PyReformerTokenizer(get_from_cache(
            self.base_tokenizer.pretrained_vocab_files_map['vocab_file']
            ['google/reformer-crime-and-punishment']),
                                                  do_lower_case=True)
        output_baseline = []
        for example in self.examples:
            output_baseline.append(
                self.base_tokenizer.encode_plus(
                    example.text_a,
                    add_special_tokens=True,
                    return_overflowing_tokens=True,
                    return_special_tokens_mask=True,
                    max_length=128))

        # When
        output_rust = self.rust_tokenizer.encode_list(
            [example.text_a for example in self.examples],
            max_len=128,
            truncation_strategy='longest_first',
            stride=0)

        # Then
        for idx, (rust,
                  baseline) in enumerate(zip(output_rust, output_baseline)):
            assert rust.token_ids == baseline[
                'input_ids'], f'Difference in tokenization for {self.rust_tokenizer.__class__}: \n ' \
                              f'Sentence a: {self.examples[idx].text_a} \n' \
                              f'Sentence b: {self.examples[idx].text_b} \n' \
                              f'Token mismatch: {self.get_token_diff(rust.token_ids, baseline["input_ids"])} \n' \
                              f'Rust: {rust.token_ids} \n' \
                              f'Python {baseline["input_ids"]}'
            assert (
                rust.special_tokens_mask == baseline['special_tokens_mask'])
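The assertion message above refers to a get_token_diff helper that is not part of this excerpt. A minimal sketch of what such a helper might look like, assuming it simply reports the positions at which the two id sequences disagree; the implementation below is an assumption, not the original.

    @staticmethod
    def get_token_diff(rust_ids, python_ids):
        # collect (position, rust_id, python_id) triples where the two sequences differ
        diffs = [(idx, rust_id, python_id)
                 for idx, (rust_id, python_id) in enumerate(zip(rust_ids, python_ids))
                 if rust_id != python_id]
        # also flag a length mismatch, since zip() silently truncates the longer sequence
        if len(rust_ids) != len(python_ids):
            diffs.append(('length mismatch', len(rust_ids), len(python_ids)))
        return diffs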
    def big_tokenizer(self):
        return ReformerTokenizer.from_pretrained(
            "google/reformer-crime-and-punishment")
Example no. 7
def main():
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    tokenizer = ReformerTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = ReformerForQuestionAnswering.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    train_examples = DeepThinkDataset(data_args.input_train_file)
    train_dataset = DTDataset(tokenizer, train_examples,
                              data_args.max_seq_length)
    eval_examples = DeepThinkDataset(data_args.input_eval_file)
    eval_dataset = DTDataset(tokenizer, eval_examples,
                             data_args.max_seq_length)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=DummyDataCollator(),
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        trainer.train(
            model_path=model_args.model_name_or_path
            if os.path.isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval and training_args.local_rank in [-1, 0]:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(eval_output.keys()):
                logger.info("  %s = %s", key, str(eval_output[key]))
                writer.write("%s = %s\n" % (key, str(eval_output[key])))

        results.update(eval_output)

    return results
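The script above depends on a DummyDataCollator that is not shown in this excerpt. A minimal sketch of what such a pass-through collator could look like, assuming DTDataset already yields dicts of equally sized tensors; the field handling and method names are assumptions, covering both the callable interface and the older collate_batch interface of earlier Trainer versions.

import torch

class DummyDataCollator:
    """Sketch of a pass-through collator: the dataset is assumed to return ready-made tensors."""

    def __call__(self, batch):
        # stack each pre-tokenized field of the examples into a single batch tensor
        return {key: torch.stack([example[key] for example in batch]) for key in batch[0]}

    def collate_batch(self, batch):
        # older Trainer versions call collate_batch() instead of calling the collator itself
        return self(batch)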
Example no. 8
# from transformers import pipeline
# nlp = pipeline("sentiment-analysis")
# result = nlp("I hate you")[0]
# print(f"label: {result['label']}, with score: {round(result['score'], 4)}")
# result = nlp("I love you")[0]
# print(f"label: {result['label']}, with score: {round(result['score'], 4)}")

from transformers import ReformerTokenizer, ReformerModel
import torch
tokenizer = ReformerTokenizer.from_pretrained(
    'google/reformer-crime-and-punishment')
model = ReformerModel.from_pretrained('google/reformer-crime-and-punishment',
                                      return_dict=True)
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)
last_hidden_states = outputs.last_hidden_state
print(last_hidden_states)
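As a small follow-up, the hidden states printed above can be reduced to one vector per input, for example by mean pooling over the sequence dimension; this pooling step is an illustration and not part of the original snippet.

# average the token representations into a single sentence vector
sentence_embedding = last_hidden_states.mean(dim=1)
print(sentence_embedding.shape)  # (batch_size, feature_dim)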