def test_inference_masked_lm(self):
    tokenizer = PerceiverTokenizer.from_pretrained("deepmind/language-perceiver")
    model = PerceiverForMaskedLM.from_pretrained("deepmind/language-perceiver")
    model.to(torch_device)

    # prepare inputs
    text = "This is an incomplete sentence where some words are missing."
    encoding = tokenizer(text, padding="max_length", return_tensors="pt")

    # mask " missing."
    encoding.input_ids[0, 52:61] = tokenizer.mask_token_id
    inputs, input_mask = encoding.input_ids.to(torch_device), encoding.attention_mask.to(torch_device)

    # forward pass
    with torch.no_grad():
        outputs = model(inputs=inputs, attention_mask=input_mask)
    logits = outputs.logits

    # verify logits
    expected_shape = torch.Size((1, tokenizer.model_max_length, tokenizer.vocab_size))
    self.assertEqual(logits.shape, expected_shape)

    expected_slice = torch.tensor(
        [[-10.8609, -10.7651, -10.9187], [-12.1689, -11.9389, -12.1479], [-12.1518, -11.9707, -12.2073]],
        device=torch_device,
    )
    self.assertTrue(torch.allclose(logits[0, :3, :3], expected_slice, atol=1e-4))

    expected_greedy_predictions = [38, 115, 111, 121, 121, 111, 116, 109, 52]
    masked_tokens_predictions = logits[0, 52:61].argmax(dim=-1).tolist()
    self.assertListEqual(expected_greedy_predictions, masked_tokens_predictions)
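
# A minimal sketch (an addition, not part of the test above) showing where the hardcoded
# mask span [52:61] comes from: PerceiverTokenizer operates on UTF-8 bytes and this
# checkpoint's tokenizer prepends a single special token, so the 9-byte suffix
# " missing." starts at character offset 51 in the text and hence at position 52
# in input_ids.
text = "This is an incomplete sentence where some words are missing."
start = text.index(" missing.") + 1  # +1 for the leading special token
end = start + len(" missing.")
assert (start, end) == (52, 61)
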
def convert_perceiver_checkpoint(pickle_file, pytorch_dump_folder_path, architecture="MLM"):
    """
    Copy/paste/tweak model's weights to our Perceiver structure.
    """

    # load parameters as FlatMapping data structure
    with open(pickle_file, "rb") as f:
        checkpoint = pickle.loads(f.read())

    state = None
    if isinstance(checkpoint, dict) and architecture in [
        "image_classification",
        "image_classification_fourier",
        "image_classification_conv",
    ]:
        # the image_classification_conv checkpoint also has batchnorm states (running_mean and running_var)
        params = checkpoint["params"]
        state = checkpoint["state"]
    else:
        params = checkpoint

    # turn into initial state dict
    state_dict = dict()
    for scope_name, parameters in hk.data_structures.to_mutable_dict(params).items():
        for param_name, param in parameters.items():
            state_dict[scope_name + "/" + param_name] = param

    if state is not None:
        # add state variables
        for scope_name, parameters in hk.data_structures.to_mutable_dict(state).items():
            for param_name, param in parameters.items():
                state_dict[scope_name + "/" + param_name] = param

    # rename keys
    rename_keys(state_dict, architecture=architecture)

    # load HuggingFace model
    config = PerceiverConfig()
    subsampling = None
    repo_id = "datasets/huggingface/label-files"
    if architecture == "MLM":
        config.qk_channels = 8 * 32
        config.v_channels = 1280
        model = PerceiverForMaskedLM(config)
    elif "image_classification" in architecture:
        config.num_latents = 512
        config.d_latents = 1024
        config.d_model = 512
        config.num_blocks = 8
        config.num_self_attends_per_block = 6
        config.num_cross_attention_heads = 1
        config.num_self_attention_heads = 8
        config.qk_channels = None
        config.v_channels = None
        # set labels
        config.num_labels = 1000
        filename = "imagenet-1k-id2label.json"
        id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename)), "r"))
        id2label = {int(k): v for k, v in id2label.items()}
        config.id2label = id2label
        config.label2id = {v: k for k, v in id2label.items()}
        if architecture == "image_classification":
            config.image_size = 224
            model = PerceiverForImageClassificationLearned(config)
        elif architecture == "image_classification_fourier":
            config.d_model = 261
            model = PerceiverForImageClassificationFourier(config)
        elif architecture == "image_classification_conv":
            config.d_model = 322
            model = PerceiverForImageClassificationConvProcessing(config)
        else:
            raise ValueError(f"Architecture {architecture} not supported")
    elif architecture == "optical_flow":
        config.num_latents = 2048
        config.d_latents = 512
        config.d_model = 322
        config.num_blocks = 1
        config.num_self_attends_per_block = 24
        config.num_self_attention_heads = 16
        config.num_cross_attention_heads = 1
        model = PerceiverForOpticalFlow(config)
    elif architecture == "multimodal_autoencoding":
        config.num_latents = 28 * 28 * 1
        config.d_latents = 512
        config.d_model = 704
        config.num_blocks = 1
        config.num_self_attends_per_block = 8
        config.num_self_attention_heads = 8
        config.num_cross_attention_heads = 1
        config.num_labels = 700
        # define dummy inputs + subsampling (as each forward pass is only on a chunk of image + audio data)
        images = torch.randn((1, 16, 3, 224, 224))
        audio = torch.randn((1, 30720, 1))
        nchunks = 128
        image_chunk_size = np.prod((16, 224, 224)) // nchunks
        audio_chunk_size = audio.shape[1] // config.samples_per_patch // nchunks
        # process the first chunk
        chunk_idx = 0
        subsampling = {
            "image": torch.arange(image_chunk_size * chunk_idx, image_chunk_size * (chunk_idx + 1)),
            "audio": torch.arange(audio_chunk_size * chunk_idx, audio_chunk_size * (chunk_idx + 1)),
            "label": None,
        }
        model = PerceiverForMultimodalAutoencoding(config)
        # set labels
        filename = "kinetics700-id2label.json"
        id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename)), "r"))
        id2label = {int(k): v for k, v in id2label.items()}
        config.id2label = id2label
        config.label2id = {v: k for k, v in id2label.items()}
    else:
        raise ValueError(f"Architecture {architecture} not supported")
    model.eval()

    # load weights
    model.load_state_dict(state_dict)

    # prepare dummy input
    input_mask = None
    if architecture == "MLM":
        tokenizer = PerceiverTokenizer.from_pretrained("/Users/NielsRogge/Documents/Perceiver/Tokenizer files")
        text = "This is an incomplete sentence where some words are missing."
        encoding = tokenizer(text, padding="max_length", return_tensors="pt")
        # mask " missing.". Note that the model performs much better if the masked chunk starts with a space.
        encoding.input_ids[0, 51:60] = tokenizer.mask_token_id
        inputs = encoding.input_ids
        input_mask = encoding.attention_mask
    elif architecture in [
        "image_classification",
        "image_classification_fourier",
        "image_classification_conv",
    ]:
        feature_extractor = PerceiverFeatureExtractor()
        image = prepare_img()
        encoding = feature_extractor(image, return_tensors="pt")
        inputs = encoding.pixel_values
    elif architecture == "optical_flow":
        inputs = torch.randn(1, 2, 27, 368, 496)
    elif architecture == "multimodal_autoencoding":
        images = torch.randn((1, 16, 3, 224, 224))
        audio = torch.randn((1, 30720, 1))
        inputs = dict(image=images, audio=audio, label=torch.zeros((images.shape[0], 700)))

    # forward pass
    if architecture == "multimodal_autoencoding":
        outputs = model(inputs=inputs, attention_mask=input_mask, subsampled_output_points=subsampling)
    else:
        outputs = model(inputs=inputs, attention_mask=input_mask)
    logits = outputs.logits

    # verify logits
    if not isinstance(logits, dict):
        print("Shape of logits:", logits.shape)
    else:
        for k, v in logits.items():
            print(f"Shape of logits of modality {k}", v.shape)

    if architecture == "MLM":
        expected_slice = torch.tensor(
            [[-11.8336, -11.6850, -11.8483], [-12.8149, -12.5863, -12.7904], [-12.8440, -12.6410, -12.8646]]
        )
        assert torch.allclose(logits[0, :3, :3], expected_slice)
        masked_tokens_predictions = logits[0, 51:60].argmax(dim=-1).tolist()
        expected_list = [38, 115, 111, 121, 121, 111, 116, 109, 52]
        assert masked_tokens_predictions == expected_list
        print("Greedy predictions:")
        print(masked_tokens_predictions)
        print()
        print("Predicted string:")
        print(tokenizer.decode(masked_tokens_predictions))
    elif architecture in [
        "image_classification",
        "image_classification_fourier",
        "image_classification_conv",
    ]:
        print("Predicted class:", model.config.id2label[logits.argmax(-1).item()])

    # Finally, save files
    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    print(f"Saving model to {pytorch_dump_folder_path}")
    model.save_pretrained(pytorch_dump_folder_path)
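
# A possible command-line entry point for convert_perceiver_checkpoint (a sketch: the flag
# names below simply mirror the function signature and are not necessarily the original
# script's exact CLI).
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--pickle_file", type=str, required=True, help="Path to the original Haiku pickle checkpoint.")
    parser.add_argument(
        "--pytorch_dump_folder_path", type=str, required=True, help="Directory to save the converted PyTorch model to."
    )
    parser.add_argument(
        "--architecture",
        type=str,
        default="MLM",
        help=(
            "Architecture of the checkpoint: one of MLM, image_classification, image_classification_fourier, "
            "image_classification_conv, optical_flow or multimodal_autoencoding."
        ),
    )
    args = parser.parse_args()
    convert_perceiver_checkpoint(args.pickle_file, args.pytorch_dump_folder_path, args.architecture)
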
def perceiver_tokenizer(self):
    return PerceiverTokenizer.from_pretrained("deepmind/language-perceiver")
def setUp(self):
    super().setUp()
    tokenizer = PerceiverTokenizer()
    tokenizer.save_pretrained(self.tmpdirname)
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        if data_args.test_file is not None:
            data_files["test"] = data_args.test_file
        extension = data_args.train_file.split(".")[-1]
        datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    if training_args.do_train:
        column_names = datasets["train"].column_names
        features = datasets["train"].features
    else:
        column_names = datasets["validation"].column_names
        features = datasets["validation"].features
    text_column_name = "tokens" if "tokens" in column_names else column_names[0]
    label_column_name = (
        f"{data_args.task_name}_tags" if f"{data_args.task_name}_tags" in column_names else column_names[1]
    )

    # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the
    # unique labels.
    def get_label_list(labels):
        unique_labels = set()
        for label in labels:
            unique_labels = unique_labels | set(label)
        label_list = list(unique_labels)
        label_list.sort()
        return label_list

    if isinstance(features[label_column_name].feature, ClassLabel):
        label_list = features[label_column_name].feature.names
        # No need to convert the labels since they are already ints.
        label_to_id = {i: i for i in range(len(label_list))}
    else:
        label_list = get_label_list(datasets["train"][label_column_name])
        label_to_id = {l: i for i, l in enumerate(label_list)}
    num_labels = len(label_list)

    config = PerceiverConfig(
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
        d_latents=model_args.d_latents,
        d_model=model_args.d_model,
        max_position_embeddings=model_args.max_position_embeddings,
    )
    tokenizer = PerceiverTokenizer(cache_dir=model_args.cache_dir, model_max_length=model_args.model_max_length)
    model = PerceiverForTokenClassification(config=config)
    model.main_input_name = "input_ids"

    # Preprocessing the dataset
    # Padding strategy
    padding = "max_length" if data_args.pad_to_max_length else False

    # Tokenize all texts and align the labels with them.
    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(
            examples[text_column_name],
            padding=padding,
            truncation=True,
            # We use this argument because the texts in our dataset are lists of words (with a label for each word).
            is_split_into_words=True,
            # return_offsets_mapping=True,
        )
        labels = []
        """
        There is no PerceiverTokenizerFast; the following code works for the conll2003 dataset, e.g.:
        words:         ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
        words_labels:  [3, 0, 7, 0, 0, 0, 7, 0, 0]
        tokens_labels: [-100, 3, 3, 0, 0, 0, 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                        7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, -100]
        """
        for i in range(len(tokenized_inputs["input_ids"])):
            # -2 for the two special tokens ([BOS] and [SEP])
            id_length = len(tokenized_inputs["input_ids"][i]) - 2
            char_length = len("".join(examples[text_column_name][i]))
            assert id_length == char_length
            label_ids = [-100]
            words = examples[text_column_name][i]
            words_labels = examples[label_column_name][i]
            for w, w_label in zip(words, words_labels):
                current_label = label_to_id[w_label]
                label_ids.extend([current_label] * len(w))
            label_ids.append(-100)
            assert len(label_ids) == id_length + 2
            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    if training_args.do_train:
        if "train" not in datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = datasets["train"]
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(range(data_args.max_train_samples))
        train_dataset = train_dataset.map(
            tokenize_and_align_labels,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    if training_args.do_eval:
        if "validation" not in datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = datasets["validation"]
        if data_args.max_val_samples is not None:
            eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
        eval_dataset = eval_dataset.map(
            tokenize_and_align_labels,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    if training_args.do_predict:
        if "test" not in datasets:
            raise ValueError("--do_predict requires a test dataset")
        test_dataset = datasets["test"]
        if data_args.max_test_samples is not None:
            test_dataset = test_dataset.select(range(data_args.max_test_samples))
        test_dataset = test_dataset.map(
            tokenize_and_align_labels,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    # Data collator
    data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None)

    # Metrics
    metric = load_metric("seqeval")

    def compute_metrics(p):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [
            [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        true_labels = [
            [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]

        results = metric.compute(predictions=true_predictions, references=true_labels)
        if data_args.return_entity_level_metrics:
            # Unpack nested dictionaries
            final_results = {}
            for key, value in results.items():
                if isinstance(value, dict):
                    for n, v in value.items():
                        final_results[f"{key}_{n}"] = v
                else:
                    final_results[key] = value
            return final_results
        else:
            return {
                "precision": results["overall_precision"],
                "recall": results["overall_recall"],
                "f1": results["overall_f1"],
                "accuracy": results["overall_accuracy"],
            }

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        if last_checkpoint is not None:
            checkpoint = last_checkpoint
        elif os.path.isdir(model_args.model_name_or_path):
            checkpoint = model_args.model_name_or_path
        else:
            checkpoint = None
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        metrics = train_result.metrics
        trainer.save_model()  # Saves the tokenizer too for easy upload

        max_train_samples = (
            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
        )
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        metrics = trainer.evaluate()

        max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset)
        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    # Predict
    if training_args.do_predict:
        logger.info("*** Predict ***")

        predictions, labels, metrics = trainer.predict(test_dataset)
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [
            [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]

        trainer.log_metrics("test", metrics)
        trainer.save_metrics("test", metrics)

        # Save predictions
        output_test_predictions_file = os.path.join(training_args.output_dir, "test_predictions.txt")
        if trainer.is_world_process_zero():
            with open(output_test_predictions_file, "w") as writer:
                for prediction in true_predictions:
                    writer.write(" ".join(prediction) + "\n")
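
# A small standalone illustration (an addition, not part of the original script) of the
# byte-level label alignment that tokenize_and_align_labels performs above: since
# PerceiverTokenizer tokenizes to UTF-8 bytes, each word-level label is repeated once per
# character of the word, and the two special tokens get -100 so the loss ignores them.
words = ["EU", "rejects", "German", "call", "to", "boycott", "British", "lamb", "."]
word_labels = [3, 0, 7, 0, 0, 0, 7, 0, 0]

token_labels = [-100]  # leading special token
for word, label in zip(words, word_labels):
    token_labels.extend([label] * len(word))
token_labels.append(-100)  # trailing special token

# 40 characters in total, plus the two special tokens
assert len(token_labels) == len("".join(words)) + 2
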