def test_load_real_metric(self, metric_name):
    with tempfile.TemporaryDirectory() as temp_data_dir:
        download_config = DownloadConfig()
        download_config.download_mode = GenerateMode.FORCE_REDOWNLOAD
        load_metric(metric_name, data_dir=temp_data_dir, download_config=download_config)
def _eval_end(self, outputs, split="val"):
    val_loss_mean = torch.stack([x[f"{split}_loss"] for x in outputs]).mean().detach().cpu()
    preds = np.concatenate([x["pred"] for x in outputs], axis=0)
    if self.hparams.glue_output_mode == "classification":
        preds = np.argmax(preds, axis=1)
    elif self.hparams.glue_output_mode == "regression":
        preds = np.squeeze(preds)
    out_label_ids = np.concatenate([x["target"] for x in outputs], axis=0)
    results = {f"{split}_loss": val_loss_mean}
    to_return = (results, preds)
    # For the validation dataset, include metric results.
    if split != "test":
        # HACK - to avoid a pickle error, the metric is not assigned as a class attribute
        metric = nlp.load_metric("glue", name=self.hparams.task)
        # HACK - the .tolist() calls here prevent an error:
        # pyarrow.lib.ArrowInvalid: Floating point value truncated
        results.update(metric.compute(preds.tolist(), out_label_ids.tolist()))
    # The test dataset should include idxs for submission
    else:
        idxs = np.concatenate([x["idx"] for x in outputs], axis=0)
        to_return += (idxs,)
    return to_return
def compute_rouge(
    self,
    dataset,
    document_column_name,
    summary_column_name,
    rouge_types=["rouge1", "rouge2", "rougeL"],
    **kwargs,
):
    """
    Generate hypotheses and compute the ROUGE score between summaries and hypotheses.

    Args:
        dataset (nlp.Dataset): dataset containing the documents to summarize
        document_column_name (str): name of the dataset column containing documents
        summary_column_name (str): name of the dataset column containing summaries
        rouge_types (list(str)): list of ROUGE types to compute
        **kwargs: arguments passed to the run function

    Return:
        score (dict(Score)): dict mapping each ROUGE type to its score (see nlp metrics for details)
    """
    dataset = self.get_summaries(dataset, document_column_name, **kwargs)
    rouge_metric = load_metric("rouge")

    def compute_rouge_batch(example):
        predictions = example[f"{self.name}_hypothesis"]
        references = example[summary_column_name]
        rouge_metric.add_batch(predictions, references)

    dataset.map(compute_rouge_batch, batched=True)
    return dataset, rouge_metric.compute(rouge_types=rouge_types)
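# A hypothetical call to compute_rouge, for illustration only: `summarizer`
# and the column names "document"/"summary" are assumptions, not part of
# the original code. The returned scores are AggregateScore objects that
# expose low/mid/high estimates (as used elsewhere in these snippets).
dataset, scores = summarizer.compute_rouge(
    dataset,
    "document",
    "summary",
    rouge_types=["rouge1", "rougeL"],
)
print(scores["rougeL"].mid.fmeasure)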
def __init__(self, source_language: str, target_language: str):
    self.source_language = source_language
    self.target_language = target_language

    forward_model_name = f'Helsinki-NLP/opus-mt-{source_language}-{target_language}'
    self.forward_model = MarianMTModel.from_pretrained(forward_model_name)
    self.forward_tokenizer = MarianTokenizer.from_pretrained(forward_model_name)
    self.forward_model.to('cuda')

    backward_model_name = f'Helsinki-NLP/opus-mt-{target_language}-{source_language}'
    self.backward_model = MarianMTModel.from_pretrained(backward_model_name)
    self.backward_tokenizer = MarianTokenizer.from_pretrained(backward_model_name)
    self.backward_model.to('cuda')

    self.bleu_metric = load_metric("bleu")
    self.bert_score_metric = load_metric("bertscore", device="cuda")
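# A minimal usage sketch, not part of the original class: `BackTranslator`
# is an assumed name for the class whose __init__ is shown above, and the
# snippet assumes a transformers version where the tokenizer is directly
# callable. It runs one forward translation pass with the loaded Marian model.
translator = BackTranslator('en', 'fi')
batch = translator.forward_tokenizer(['Hello world'], return_tensors='pt', padding=True).to('cuda')
generated = translator.forward_model.generate(**batch)
print(translator.forward_tokenizer.batch_decode(generated, skip_special_tokens=True))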
def _init_rouge(self, rouge_type, rouge_method):
    self.rouge_metric = load_metric("rouge")
    self.rouge_type = rouge_type
    if rouge_method == "precision":
        self.rouge_method = 0
    elif rouge_method == "recall":
        self.rouge_method = 1
    elif rouge_method == "fmeasure":
        self.rouge_method = 2
    else:
        raise ValueError('rouge_method must be "precision", "recall" or "fmeasure"')
def __init__(self, name, rouge_type="rouge2", rouge_method="precision"):
    super().__init__(name)
    self.rouge_metric = load_metric("rouge")
    self.rouge_type = rouge_type
    if rouge_method == "precision":
        self.rouge_method = 0
    elif rouge_method == "recall":
        self.rouge_method = 1
    elif rouge_method == "fmeasure":
        self.rouge_method = 2
    else:
        raise ValueError('rouge_method must be "precision", "recall" or "fmeasure"')
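# Why map rouge_method to an integer? For each ROUGE type, the "rouge"
# metric returns an AggregateScore whose low/mid/high entries are Score
# namedtuples of the form (precision, recall, fmeasure), so 0/1/2 selects
# the field positionally. A minimal, self-contained sketch (the example
# strings are illustrative):
from nlp import load_metric

rouge = load_metric("rouge")
rouge.add_batch(["the cat sat on the mat"], ["the cat was on the mat"])
scores = rouge.compute(rouge_types=["rouge2"])
print(scores["rouge2"].mid[0])  # index 0 -> precision, 1 -> recall, 2 -> fmeasure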
def test_something(self):
    # Print all the available datasets
    print([dataset.id for dataset in nlp.list_datasets()])

    # Load a dataset and print the first example in the training set
    squad_dataset = nlp.load_dataset('squad')
    print(squad_dataset['train'][0])

    # List all the available metrics
    print([metric.id for metric in nlp.list_metrics()])

    # Load a metric
    squad_metric = nlp.load_metric('squad')

    self.assertEqual(True, True)
def test_load_real_metric(self, metric_name):
    with tempfile.TemporaryDirectory() as temp_data_dir:
        download_config = DownloadConfig()
        download_config.force_download = True
        name = None
        if metric_name == "glue":
            name = "sst2"
        metric = load_metric(metric_name, name=name, data_dir=temp_data_dir, download_config=download_config)
        parameters = inspect.signature(metric._compute).parameters
        self.assertTrue("predictions" in parameters)
        self.assertTrue("references" in parameters)
        self.assertTrue(all(p.kind != p.VAR_KEYWORD for p in parameters.values()))  # no **kwargs
def __init__(self, hparams) -> None:
    super(T5FineTuner, self).__init__()
    self.hparams = hparams
    self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name)
    self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name)
    self.rouge_metric = load_metric('rouge')

    if self.hparams.freeze_embeds:
        self.freeze_embeds()
    if self.hparams.freeze_encoder:
        self.freeze_params(self.model.get_encoder())
        self.assert_all_frozen(self.model.get_encoder())

    n_observations_per_split = {
        "train": self.hparams.n_train,
        "validation": self.hparams.n_val,
        "test": self.hparams.n_test,
    }
    self.n_obs = {k: v if v >= 0 else None for k, v in n_observations_per_split.items()}
def __init__(self, module, compute_args={}):
    self.scorer = nlp.load_metric(module)
    self.compute_args = compute_args
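# Hypothetical usage of the wrapper above; `MetricScorer` is an assumed
# class name, since only its __init__ is shown. compute_args would hold
# keyword arguments forwarded later to self.scorer.compute(...).
scorer = MetricScorer("rouge", compute_args={"rouge_types": ["rouge1", "rougeL"]})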
import sacrebleu
from nlp import load_metric

# target = open("./data/en-fi/test.trg")
# output = open("./outputs/en-fi.txt")
target = open("./data/en-fi/newstest2017-enfi.fi")
output = open("./outputs/newstest2017-en-fi.txt")

bert_score_metric = load_metric("bertscore", device="cuda")

targets = []
outputs = []
for target_sample, output_sample in zip(target, output):
    targets.append(target_sample)
    outputs.append(output_sample)

print(sacrebleu.corpus_bleu(outputs, [targets]).score)
print(bert_score_metric.compute(
    outputs,
    targets,
    lang='fi',
    model_type="roberta-base",
    device="cuda",
)['f1'].mean())
save(train_loader, save_path)


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('-data_path', default='../datasets/', type=str)
    parser.add_argument('-data_name', default='debate', type=str)
    parser.add_argument('-mode', default='train', type=str)
    parser.add_argument('-batch_size', default=4, type=int)
    parser.add_argument('-random_seed', type=int, default=199744)
    parser.add_argument('-minor_data', action='store_true')
    parser.add_argument('-percentage', default=100, type=int)
    args = parser.parse_args()

    import nlp
    rouge = nlp.load_metric('rouge')

    # set random seed
    random.seed(args.random_seed)
    np.random.seed(args.random_seed)
    torch.manual_seed(args.random_seed)
    torch.cuda.manual_seed(args.random_seed)
    torch.backends.cudnn.deterministic = True

    # if using a subset of the data
    if args.minor_data:
        print('making dataset for {}% of the data'.format(str(args.percentage)))
        if not os.path.exists(args.data_path + args.data_name + '/minor_data'):
            os.makedirs(args.data_path + args.data_name + '/minor_data')

    multi_news_builder(args)
def __init__(self, config, word_emb, con_emb, word_vocab, concept_vocab, relation_vocab):
    super(DualTransformer, self).__init__()
    self.config = config
    self.word_vocab = word_vocab
    self.concept_vocab = concept_vocab
    self.relation_vocab = relation_vocab

    self.enc_word_embedding = self.build_embedding(word_emb, word_vocab, self.config.d_enc_sent)
    self.word_encoder = SentTransformer(config, self.enc_word_embedding, word_vocab)

    if config.dual_enc and self.concept_vocab is not None and relation_vocab is not None:
        if config.share_con_vocab:
            self.enc_concept_embedding = self.enc_word_embedding
        else:
            self.enc_concept_embedding = self.build_embedding(con_emb, concept_vocab, self.config.d_enc_concept)
        self.graph_encoder = GraphTransformer(config, self.enc_concept_embedding, concept_vocab, relation_vocab)
    else:
        self.graph_encoder = None

    self.dec_word_embedding = self.enc_word_embedding
    self.position_encoder = PositionalEncoding(config.d_dec)

    dual_mode = getattr(config, "dual_mode", "cat")
    if config.dual_enc:
        if dual_mode == "cat":
            decoder_layer = DoubleAttnTransformerDecoderLayer(
                d_model=config.d_dec,
                d_sent=config.d_enc_sent,
                d_con=config.d_enc_concept,
                heads=config.n_head,
                d_ff=1024,
                dropout=config.dropout,
                att_drop=config.dropout,
                dual_enc=config.dual_enc,  # dual_enc=False when using a single sentence encoder
            )
        elif dual_mode == "graph_first":
            decoder_layer = DoubleAttnTransformerDecoderLayerGraphFirst(
                d_model=config.d_dec,
                d_enc=config.d_model + config.d_concept if config.dual_enc else config.d_model,
                heads=config.n_head,
                d_ff=1024,
                dropout=config.dropout,
                att_drop=config.dropout,
                dual_enc=config.dual_enc,
            )
        elif dual_mode == "sent_first":
            decoder_layer = DoubleAttnTransformerDecoderLayerSentFirst(
                d_model=config.d_dec,
                d_enc=config.d_model + config.d_concept if config.dual_enc else config.d_model,
                heads=config.n_head,
                d_ff=1024,
                dropout=config.dropout,
                att_drop=config.dropout,
                dual_enc=config.dual_enc,
            )
        else:
            raise ValueError('Invalid dual_mode, should be one of (cat, graph_first, sent_first)')
    else:
        decoder_layer = DoubleAttnTransformerDecoderLayer(
            d_model=config.d_dec,
            d_sent=config.d_enc_sent,
            d_con=config.d_enc_concept,
            heads=config.n_head,
            d_ff=1024,
            dropout=config.dropout,
            att_drop=config.dropout,
            dual_enc=config.dual_enc,
        )

    decoder_norm = nn.LayerNorm(config.d_dec)
    self.decoder = DoubleAttnTransformerDecoder(decoder_layer, config.num_layer, decoder_norm)

    if word_vocab is not None:
        self.word_vocab_size = len(self.word_vocab)
        self.BOS = self.word_vocab["<bos>"]
        self.EOS = self.word_vocab["<eos>"]
        self.projector = nn.Linear(config.d_dec, self.word_vocab_size)
        if self.config.share_vocab:  # existing bugs to be fixed
            self.projector.weight = self.dec_word_embedding.weight

    if self.config.use_kl_loss:
        self.kl = nn.KLDivLoss(size_average=False)
    if self.config.rl_ratio > 0.0 and self.config.rl_type == "bertscore":
        self.rl_metric = nlp.load_metric("bertscore")
```python
import nlp

# You need to give the total number of parallel python processes (num_process)
# and the id of each process (process_id)
bleu_metric = nlp.load_metric('bleu',
                              process_id=torch.distributed.get_rank(),
                              num_process=torch.distributed.get_world_size())

for batch in dataloader:
    model_input, targets = batch
    predictions = model(model_input)
    bleu_metric.add_batch(predictions, targets)

# Compute the score on the first node by default (can be set to compute on each node as well)
score = bleu_metric.compute()
```

Example with a NER metric: `seqeval`

```python
ner_metric = nlp.load_metric('seqeval')
references = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
predictions = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
ner_metric.compute(predictions, references)
```

# Adding a new dataset or a new metric

There are two ways to add new datasets and metrics in `nlp`:

- Datasets can be added with a Pull Request adding a script to the `datasets` folder of the [`nlp` repository](https://github.com/huggingface/nlp) => once the PR is merged, the dataset can be instantiated by its folder name, e.g. `nlp.load_dataset('squad')`. If you want HuggingFace to host the data as well, you will need to ask the HuggingFace team to upload it.
- Datasets can also be added with a direct upload using the `nlp` CLI as a user or organization (like for models in `transformers`). In this case the dataset will be accessible under the given user/organization name, e.g. `nlp.load_dataset('thomwolf/squad')`, and you can upload the data yourself at the same time and in the same folder.

We will add a full tutorial on how to add and upload datasets soon.
def main():
    args = get_args()
    dataset_dict = {
        "stsb": nlp.load_dataset('glue', name="stsb"),
        "rte": nlp.load_dataset('glue', name="rte"),
        "commonsense_qa": nlp.load_dataset('commonsense_qa'),
    }
    for task_name, dataset in dataset_dict.items():
        print(task_name)
        print(dataset_dict[task_name]["train"][0])
        print()

    multitask_model = MultitaskModel.create(
        model_name=model_name,
        model_type_dict={
            "stsb": transformers.AutoModelForSequenceClassification,
            "rte": transformers.AutoModelForSequenceClassification,
            "commonsense_qa": transformers.AutoModelForMultipleChoice,
        },
        model_config_dict={
            "stsb": transformers.AutoConfig.from_pretrained(model_name, num_labels=1),
            "rte": transformers.AutoConfig.from_pretrained(model_name, num_labels=2),
            "commonsense_qa": transformers.AutoConfig.from_pretrained(model_name),
        },
    )

    if model_name.startswith("roberta-"):
        # Sanity check: all task models should share the same encoder embeddings
        print(multitask_model.encoder.embeddings.word_embeddings.weight.data_ptr())
        print(multitask_model.taskmodels_dict["stsb"].roberta.embeddings.word_embeddings.weight.data_ptr())
        print(multitask_model.taskmodels_dict["rte"].roberta.embeddings.word_embeddings.weight.data_ptr())
        print(multitask_model.taskmodels_dict["commonsense_qa"].roberta.embeddings.word_embeddings.weight.data_ptr())

    convert_func_dict = {
        "stsb": convert_to_stsb_features,
        "rte": convert_to_rte_features,
        "commonsense_qa": convert_to_commonsense_qa_features,
    }
    columns_dict = {
        "stsb": ['input_ids', 'attention_mask', 'labels'],
        "rte": ['input_ids', 'attention_mask', 'labels'],
        "commonsense_qa": ['input_ids', 'attention_mask', 'labels'],
    }

    features_dict = {}
    for task_name, dataset in dataset_dict.items():
        features_dict[task_name] = {}
        for phase, phase_dataset in dataset.items():
            features_dict[task_name][phase] = phase_dataset.map(
                convert_func_dict[task_name],
                batched=True,
                load_from_cache_file=False,
            )
            print(task_name, phase, len(phase_dataset), len(features_dict[task_name][phase]))
            features_dict[task_name][phase].set_format(
                type="torch",
                columns=columns_dict[task_name],
            )
            print(task_name, phase, len(phase_dataset), len(features_dict[task_name][phase]))

    train_dataset = {
        task_name: dataset["train"]
        for task_name, dataset in features_dict.items()
    }
    trainer = MultitaskTrainer(
        model=multitask_model,
        args=transformers.TrainingArguments(
            output_dir=args.job_dir,
            overwrite_output_dir=True,
            learning_rate=1e-5,
            do_train=True,
            num_train_epochs=3,
            per_device_train_batch_size=args.batch_size,
            save_steps=3000,
        ),
        data_collator=NLPDataCollator(),
        train_dataset=train_dataset,
    )
    trainer.train()

    preds_dict = {}
    for task_name in ["rte", "stsb", "commonsense_qa"]:
        eval_dataloader = DataLoaderWithTaskname(
            task_name,
            trainer.get_eval_dataloader(eval_dataset=features_dict[task_name]["validation"]),
        )
        print(eval_dataloader.data_loader.collate_fn)
        preds_dict[task_name] = trainer._prediction_loop(
            eval_dataloader,
            description=f"Validation: {task_name}",
        )

    # Evaluate RTE
    nlp.load_metric('glue', name="rte").compute(
        np.argmax(preds_dict["rte"].predictions, axis=1),
        preds_dict["rte"].label_ids,
    )
    # Evaluate STS-B
    nlp.load_metric('glue', name="stsb").compute(
        preds_dict["stsb"].predictions.flatten(),
        preds_dict["stsb"].label_ids,
    )
    # Evaluate Commonsense QA
    np.mean(
        np.argmax(preds_dict["commonsense_qa"].predictions, axis=1)
        == preds_dict["commonsense_qa"].label_ids
    )
import nlp

# List all the available datasets
res = nlp.list_datasets()

# Load a dataset and print the first example in the training set
squad_dataset = nlp.load_dataset('squad')
print(squad_dataset['train'][0])

# List all the available metrics
print(nlp.list_metrics())

# Load a metric
squad_metric = nlp.load_metric('squad')
def stats(dataset_script_path, dataset_cache_path, do_rouge):
    def words_counter(text):
        text = text.translate(str.maketrans(punctuation, " " * len(punctuation)))
        return len(text.split(" "))

    def sentences_counter(text):
        return len(sent_tokenize(text))

    rouge_metric = load_metric("rouge")

    num_sources = []
    sum_num_words = []
    sum_num_sentences = []
    doc_num_words = []
    doc_num_sentences = []

    def compute_stats(example):
        # Rouge score
        prediction = example["clean_document"]
        reference = example["clean_summary"]
        rouge_metric.add(prediction, reference)
        # Number of sources
        num_sources.append(example["document"].count("|||") + 1)
        # Summary length
        sum_num_words.append(words_counter(example["clean_summary"]))
        sum_num_sentences.append(sentences_counter(example["clean_summary"]))
        # Document length
        doc_num_words.append(words_counter(example["clean_document"]))
        doc_num_sentences.append(sentences_counter(example["clean_document"]))

    dataset = load_dataset(
        dataset_script_path,
        cache_dir=dataset_cache_path,
        split="train+test+validation",
    )
    dataset = dataset.map(compute_stats)

    if do_rouge:
        rouge_stats = rouge_metric.compute(rouge_types=["rouge1", "rouge2", "rougeL"])

    # Print number of examples
    print(f"The dataset contains {len(dataset)} examples.")

    # Print number of sources stats
    print(
        "number of articles with:\n - 1 source: {}\n - 2 sources: {}\n - 3 sources: {}\n - 4 sources: {}\n - more sources: {}".format(
            num_sources.count(1),
            num_sources.count(2),
            num_sources.count(3),
            num_sources.count(4),
            len(num_sources) - num_sources.count(1) - num_sources.count(2) - num_sources.count(3) - num_sources.count(4),
        )
    )

    # Print length stats
    print(
        "number of words in document:\t{}\nnumber of sentences in document:\t{}\nnumber of words in summary:\t{}\nnumber of sentences in summary:\t{}\n".format(
            np.mean(doc_num_words),
            np.mean(doc_num_sentences),
            np.mean(sum_num_words),
            np.mean(sum_num_sentences),
        )
    )

    # Print ROUGE stats
    if do_rouge:
        print(
            "Rouge-1 R:\t{}\nRouge-2 R:\t{}\nRouge-L R:\t{}\n".format(
                rouge_stats["rouge1"].mid.recall,
                rouge_stats["rouge2"].mid.recall,
                rouge_stats["rougeL"].mid.recall,
            )
        )
    return None
                    default=DEFAULT_MODEL_NAME)
args = parser.parse_args()
LOG.info("Parsed arguments %s", args)

# Step 1: preprocess the dataset and load data
lcsts = LCSTS(args.training_path,
              args.val_path,
              args.test_path,
              output_path=args.preprocess_output_path)
LOG.info("Train files saved to path {}".format(lcsts.train_merged_csv))
LOG.info("Validation files saved to path {}".format(lcsts.val_merged_csv))
LOG.info("Test files saved to path {}".format(lcsts.test_merged_csv))

# Load tokenizer
tokenizer = load_tokenizer(args.model_name)

# Load ROUGE for validation
rouge = nlp.load_metric("rouge")

if torch.cuda.device_count() > 0:
    with torch.cuda.device(0):
        import sys
        print('__Python VERSION:', sys.version)
        print('__pyTorch VERSION:', torch.__version__)
        print('__CUDA VERSION')
        from subprocess import call
        # call(["nvcc", "--version"]) does not work
        print('__CUDNN VERSION:', torch.backends.cudnn.version())
        print('__Number CUDA Devices:', torch.cuda.device_count())
        print('__Devices')
        call([
            "nvidia-smi",
            "--format=csv",