def test_push_dataset_dict_to_hub_custom_features(self):
    features = Features({"x": Value("int64"), "y": ClassLabel(names=["neg", "pos"])})
    ds = Dataset.from_dict({"x": [1, 2, 3], "y": [0, 0, 1]}, features=features)
    local_ds = DatasetDict({"test": ds})
    ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
    try:
        local_ds.push_to_hub(ds_name, token=self._token)
        hub_ds = load_dataset(ds_name, download_mode="force_redownload")
        self.assertDictEqual(local_ds.column_names, hub_ds.column_names)
        self.assertListEqual(list(local_ds["test"].features.keys()),
                             list(hub_ds["test"].features.keys()))
        self.assertDictEqual(local_ds["test"].features, hub_ds["test"].features)
    finally:
        self._api.delete_repo(ds_name.split("/")[1],
                              organization=ds_name.split("/")[0],
                              token=self._token,
                              repo_type="dataset")
def load_datasets(lang="es", random_state=2021, preprocessing_args={}):
    """
    Load emotion recognition datasets
    """
    train_df = load_df(paths[lang]["train"])
    test_df = load_df(paths[lang]["test"])
    train_df, dev_df = train_test_split(train_df,
                                        stratify=train_df["label"],
                                        random_state=random_state)

    for df in [train_df, dev_df, test_df]:
        for label, idx in label2id.items():
            df.loc[df["label"] == label, "label"] = idx
        df["label"] = df["label"].astype(int)

    preprocess = lambda x: preprocess_tweet(x, lang=lang, **preprocessing_args)

    train_df.loc[:, "text"] = train_df["text"].apply(preprocess)
    dev_df.loc[:, "text"] = dev_df["text"].apply(preprocess)
    test_df.loc[:, "text"] = test_df["text"].apply(preprocess)

    features = Features({
        'text': Value('string'),
        'label': ClassLabel(num_classes=len(id2label),
                            names=[id2label[k] for k in sorted(id2label.keys())])
    })

    train_dataset = Dataset.from_pandas(train_df, features=features)
    dev_dataset = Dataset.from_pandas(dev_df, features=features)
    test_dataset = Dataset.from_pandas(test_df, features=features)

    return train_dataset, dev_dataset, test_dataset
def complex_dataset():
    features = {
        "translation": Translation(languages=("en", "fr")),
        "sentiment": ClassLabel(num_classes=2),
    }
    return datasets.Dataset.from_dict(COMPLEX_DATA, Features(features))
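# COMPLEX_DATA is defined elsewhere in the original module. A hypothetical value
# matching the features above would look like this (illustrative only; Translation
# encodes each example as a dict mapping language code to text, ClassLabel as an int):
# COMPLEX_DATA = {
#     "translation": [{"en": "good movie", "fr": "bon film"},
#                     {"en": "bad movie", "fr": "mauvais film"}],
#     "sentiment": [1, 0],
# }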
def dictionary():
    table = csv.read_csv("./data/train.csv",
                         parse_options=ParseOptions(delimiter="\t"))
    # datasets = load_dataset("csv", data_files="./data/train.csv", delimiter="\t", quoting=csv_lib.QUOTE_NONE)
    # train_dataset: Dataset = datasets["train"]
    # train_dataset = Dataset(arrow_table=table)
    # table = train_dataset.data
    aa = set(table.column("label").to_pylist())
    class_label_ = table.column("label").unique()
    class_label = ClassLabel(num_classes=len(class_label_),
                             names=class_label_.tolist())
    # ner_ids_list: ChunkedArray = class_label.str2int(label.column('label').to_numpy())
    return class_label
def add_label_names(dataset: Dataset, label_column: str, label_names: List[str]):
    """Adds `names` to a specified `label` column.

    All labels (i.e. integers) in the dataset should be less than the number of
    label names.

    Args:
        dataset: a Dataset to add label names to
        label_column: the name of the label column (such as `label` or `labels`) in the dataset
        label_names: a list of label names

    Returns:
        Dataset: A copy of the passed `dataset` with added label names
    """
    new_features: Features = dataset.features.copy()
    new_features[label_column] = ClassLabel(names=label_names)
    return dataset.cast(new_features)
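# A minimal usage sketch for add_label_names. The helper name and toy data below
# are illustrative only, not from the original project (assumes `Dataset` is imported
# from datasets, as in the function above).
def _add_label_names_example():
    toy = Dataset.from_dict({"text": ["good", "bad"], "label": [1, 0]})
    named = add_label_names(toy, "label", ["negative", "positive"])
    # The label column is now a ClassLabel, so integer ids can be mapped back to names.
    assert named.features["label"].names == ["negative", "positive"]
    return named.features["label"].int2str(named["label"])  # ['positive', 'negative']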
def test_push_dataset_to_hub_custom_features(self):
    features = Features({"x": Value("int64"), "y": ClassLabel(names=["neg", "pos"])})
    ds = Dataset.from_dict({"x": [1, 2, 3], "y": [0, 0, 1]}, features=features)
    ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
    try:
        ds.push_to_hub(ds_name, token=self._token)
        hub_ds = load_dataset(ds_name, split="train", download_mode="force_redownload")
        self.assertListEqual(ds.column_names, hub_ds.column_names)
        self.assertListEqual(list(ds.features.keys()), list(hub_ds.features.keys()))
        self.assertDictEqual(ds.features, hub_ds.features)
        self.assertEqual(ds[:], hub_ds[:])
    finally:
        self.cleanup_repo(ds_name)
def read_dataset_from_csv(csv_path):
    """
    read the prepared csv data as Dataset object
    """
    df = pd.read_csv(csv_path,
                     converters={'token': str, 'written': str, 'spoken': str})
    feature_tag = Sequence(
        ClassLabel(num_classes=3, names=list(pd.factorize(df['tag'])[1])))
    df['tag'] = df['tag'].apply(feature_tag.feature.str2int)
    df_text = df.groupby(['sentence_id']).agg({'token': list, 'tag': list})
    dataset = Dataset.from_pandas(df_text)
    # Cast the tag column so the ClassLabel names are recorded in the dataset schema.
    dataset = dataset.cast_column("tag", feature_tag)
    return dataset
def load_datasets(preprocess_args={}):
    """
    Return train, dev, test datasets
    """
    train_files = glob(os.path.join(tass_dir, "train/*.tsv"))
    dev_files = glob(os.path.join(tass_dir, "dev/*.tsv"))
    test_files = glob(os.path.join(tass_dir, "test1.1/*.tsv"))

    train_dfs = {get_lang(file): load_df(file) for file in train_files}
    dev_dfs = {get_lang(file): load_df(file) for file in dev_files}
    test_dfs = {get_lang(file): load_df(file, test=True) for file in test_files}

    train_df = pd.concat(train_dfs.values())
    dev_df = pd.concat(dev_dfs.values())
    test_df = pd.concat(test_dfs.values())

    print(len(train_df), len(dev_df), len(test_df))

    """
    Tokenize tweets
    """
    preprocess_with_args = lambda x: preprocess_tweet(x, **preprocess_args)

    train_df["text"] = train_df["text"].apply(preprocess_with_args)
    dev_df["text"] = dev_df["text"].apply(preprocess_with_args)
    test_df["text"] = test_df["text"].apply(preprocess_with_args)

    features = Features({
        'text': Value('string'),
        'lang': Value('string'),
        'label': ClassLabel(num_classes=3, names=["neg", "neu", "pos"])
    })

    columns = ["text", "lang", "label"]

    train_dataset = Dataset.from_pandas(train_df[columns], features=features)
    dev_dataset = Dataset.from_pandas(dev_df[columns], features=features)
    test_dataset = Dataset.from_pandas(test_df[columns], features=features)

    return train_dataset, dev_dataset, test_dataset
def load_datasets(seed=2021, preprocessing_args={}):
    """
    Return train, dev, test datasets
    """
    train_df = load_df(os.path.join(semeval_dir, "train.csv"))
    test_df = load_df(os.path.join(semeval_dir, "test.csv"))
    # Pass the seed so the train/dev split is reproducible.
    train_df, dev_df = train_test_split(train_df, test_size=0.2, random_state=seed)

    print(len(train_df), len(dev_df), len(test_df))

    """
    Tokenize tweets
    """
    en_preprocess = lambda x: preprocess_tweet(x, lang="en", **preprocessing_args)

    train_df["text"] = train_df["text"].apply(en_preprocess)
    dev_df["text"] = dev_df["text"].apply(en_preprocess)
    test_df["text"] = test_df["text"].apply(en_preprocess)

    features = Features({
        'id': Value('int64'),
        'text': Value('string'),
        'label': ClassLabel(num_classes=3, names=["NEG", "NEU", "POS"])
    })

    columns = ["text", "id", "label"]

    train_dataset = Dataset.from_pandas(train_df[columns], features=features)
    dev_dataset = Dataset.from_pandas(dev_df[columns], features=features)
    test_dataset = Dataset.from_pandas(test_df[columns], features=features)

    return train_dataset, dev_dataset, test_dataset
def load_datasets():
    """
    Return train, dev, test datasets
    """
    train_files = glob("data/tass2020/train/*.tsv")
    dev_files = glob("data/tass2020/dev/*.tsv")
    test_files = glob("data/tass2020/test1.1/*.tsv")

    train_dfs = {get_lang(file): load_df(file) for file in train_files}
    dev_dfs = {get_lang(file): load_df(file) for file in dev_files}
    test_dfs = {get_lang(file): load_df(file) for file in test_files}

    train_df = pd.concat(train_dfs.values())
    dev_df = pd.concat(dev_dfs.values())
    test_df = pd.concat(test_dfs.values())

    print(len(train_df), len(dev_df), len(test_df))

    train_df["text"] = train_df["text"].apply(preprocess_tweet)
    dev_df["text"] = dev_df["text"].apply(preprocess_tweet)
    test_df["text"] = test_df["text"].apply(preprocess_tweet)

    features = Features({
        'text': Value('string'),
        'label': ClassLabel(num_classes=3, names=["neg", "neu", "pos"])
    })

    train_dataset = Dataset.from_pandas(train_df[["text", "label"]], features=features)
    dev_dataset = Dataset.from_pandas(dev_df[["text", "label"]], features=features)
    test_dataset = Dataset.from_pandas(test_df[["text", "label"]], features=features)

    return train_dataset, dev_dataset, test_dataset
def predict(self, input_path, output_path):
    key = 'tmp'
    input_df = pd.DataFrame()
    if self.pretrained:
        input_df['src_token'] = read_txt(input_path)
        input_df['src_token'] = input_df['src_token'].str.lower()
        input_df['token'] = input_df['src_token'].str.split()
        input_df['tag'] = input_df['token'].apply(lambda x: ['O'] * len(x))
        input_df['sentence_id'] = input_df.index

        trainer = Trainer(model=self.model,
                          tokenizer=self.tokenizer,
                          data_collator=self.data_collator)

        feature_tag = Sequence(ClassLabel(num_classes=3, names=self.label_list))
        input_df['tag'] = input_df['tag'].apply(feature_tag.feature.str2int)
        eval_dataset = Dataset.from_pandas(input_df)
        eval_dataset.features["tag"] = feature_tag

        # predict
        tokenized_datasets = DatasetDict({
            key: eval_dataset
        }).map(self.tokenize_and_align_labels, batched=True)
        _, true_predictions = self.predict_dataset(trainer, tokenized_datasets[key])
        result = save_classifier_result(eval_dataset, true_predictions, output_path)
        return result
    else:
        input_df['token'] = read_txt(input_path)
        input_df['sentence_id'] = input_df.index
        input_df['tag'] = 'B'
        input_df.to_csv(output_path, index=False)
        print("Result saved to ", output_path)
        return input_df
    'fn': 'grey',
    'tb': 'beige'
}
id2label = {v: k for k, v in label2id.items()}
LABELS = [label2id[L] for L in LABELS]

from datasets import Features, Sequence, ClassLabel, Value, Array2D, Array3D

FEATURES = Features({
    'image': Array3D(dtype="int64", shape=(3, 224, 224)),
    'input_ids': Sequence(feature=Value(dtype='int64')),
    'attention_mask': Sequence(Value(dtype='int64')),
    'token_type_ids': Sequence(Value(dtype='int64')),
    'bbox': Array2D(dtype="int64", shape=(512, 4)),
    'labels': Sequence(ClassLabel(names=LABELS + [max(LABELS) + 1]))
})
NUM_LABELS = len(LABELS)

PROCESSOR_PICKLE = f"processor_module{NUM_LABELS}.pickle"
MODEL_PICKLE = f"model_module{NUM_LABELS}.pickle"

EPOCHS_LAYOUT = 84

PDF_UPLOAD_DIR = hidden_folder + "/pdf_upload/"
ELMO_DIFFERENCE_MODEL_PATH = hidden_folder + "elmo_difference_models"
ELMO_DIFFERENCE_COLLECTION_PATH = hidden_folder + "elmo_difference_collection"

PORT = 7789
TOPIC_TEXT_LENGTH = 180
def label_mapper(x):
    labels = ClassLabel(names=["neutral", "entails"])
    return {"label": labels.str2int(x)}
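# A minimal usage sketch for label_mapper. The helper name, toy data, and import
# below are illustrative assumptions, not part of the original snippet.
def _label_mapper_example():
    from datasets import Dataset
    toy = Dataset.from_dict({"text": ["a", "b"], "label": ["neutral", "entails"]})
    # input_columns="label" passes each string label as `x`; the returned dict
    # overwrites the column with the corresponding integer id.
    return toy.map(label_mapper, input_columns="label")  # labels become [0, 1]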
def get_dataset(tokenizer, model_id=0, args=None, output_all_cols=False, data_dir=''):
    ds_path = os.path.join(data_dir, f'task2/preprocessed_data/model{model_id}',
                           args.transformer)
    print(f'Dataset path: {ds_path}')
    try:
        encoded_ds = DatasetDict.load_from_disk(ds_path)
        print('Reloaded persisted dataset.')
    except FileNotFoundError:
        # No preprocessed dataset on disk yet; build it from scratch.
        ds: DatasetDict = load_dataset("humicroedit", "subtask-2")

        glove, synset_sizes = None, None
        if model_id == 0:
            glove = torchtext.vocab.GloVe(name='840B', dim=300,
                                          cache=os.path.join(os.environ['HOME'],
                                                             '.vector_cache'))
            synset_sizes = get_synsets_sizes(ds, task=2)

        for i in range(2):
            ds = ds.rename_column(f'edit{i+1}', f'word_fin{i+1}')
            ds = ds.map(
                get_preprocess_ds(glove=glove, synset_sizes=synset_sizes, idx=i + 1))
            ds = ds.remove_columns([f'original{i+1}'])
            ds = ds.rename_column(f'meanGrade{i+1}', f'grade{i+1}')

        if model_id == 2:
            ds = ds.map(add_T5_input)

        ds = ds.rename_column('label', 'labels')
        binary_ds = ds.filter(lambda ex: ex['labels'] != 0).map(
            lambda ex: {'labels': ex['labels'] - 1})
        binary_ds_features = ds['train'].features.copy()
        binary_ds_features['labels'] = ClassLabel(
            names=ds['train'].features['labels'].names[1:])
        binary_ds = binary_ds.cast(binary_ds_features)

        encode_fn = get_encode(tokenizer, model_id=model_id)
        encoded_ds = binary_ds.map(encode_fn, batched=True, batch_size=100)

        print('Saving preprocessed dataset.')
        os.makedirs(ds_path)
        encoded_ds.save_to_disk(ds_path)

    if model_id == 0:
        from task1.data import get_encoded_ds_cols
        encoded_ds_cols = get_encoded_ds_cols(args)
        encoded_ds_cols = [
            f'{col}{i+1}' for i in range(2) for col in encoded_ds_cols
        ]
        encoded_ds_cols += ['grade1', 'grade2']
    elif model_id == 1 and args.transformer != 'distilbert-base-cased':
        encoded_ds_cols = ['input_ids', 'token_type_ids', 'attention_mask']
    else:
        encoded_ds_cols = ['input_ids', 'attention_mask']

    for _ds in encoded_ds.values():
        _ds.set_format(type='torch',
                       columns=encoded_ds_cols + ['labels'],
                       output_all_columns=output_all_cols)
    return encoded_ds
bug_types = [
    bug.bug_type for bug in sstubs if bug.bug_type not in ignored_bug_types
]
train_data, test_data, train_labels, test_labels = train_test_split(
    all_data, all_labels, test_size=0.2, random_state=42, stratify=bug_types)

train_data = itertools.chain.from_iterable(train_data)
train_labels = itertools.chain.from_iterable(train_labels)
test_data = itertools.chain.from_iterable(test_data)
test_labels = itertools.chain.from_iterable(test_labels)

class_names = ['not_buggy', 'buggy']
features = Features({
    'text': Value('string'),
    'label': ClassLabel(names=class_names)
})

raw_train_dataset = Dataset.from_dict(
    {
        'text': train_data,
        'label': train_labels
    },
    features=features,
)
raw_val_dataset = Dataset.from_dict(
    {
        'text': test_data,
        'label': test_labels
    },
    features=features,
def main():
    args = get_args()
    set_seed(args.seed)

    dataset = load_dataset("codeparrot/codecomplex", split="train")
    train_test = dataset.train_test_split(test_size=0.2)
    test_validation = train_test["test"].train_test_split(test_size=0.5)
    train_test_validation = DatasetDict({
        "train": train_test["train"],
        "test": test_validation["train"],
        "valid": test_validation["test"],
    })

    print("Loading tokenizer and model")
    tokenizer = AutoTokenizer.from_pretrained(args.model_ckpt)
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForSequenceClassification.from_pretrained(args.model_ckpt,
                                                               num_labels=7)
    model.config.pad_token_id = model.config.eos_token_id

    if args.freeze:
        for param in model.roberta.parameters():
            param.requires_grad = False

    labels = ClassLabel(num_classes=7,
                        names=list(set(train_test_validation["train"]["complexity"])))

    def tokenize(example):
        inputs = tokenizer(example["src"], truncation=True, max_length=1024)
        label = labels.str2int(example["complexity"])
        return {
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
            "label": label,
        }

    tokenized_datasets = train_test_validation.map(
        tokenize,
        batched=True,
        remove_columns=train_test_validation["train"].column_names,
    )
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    training_args = TrainingArguments(
        output_dir=args.output_dir,
        learning_rate=args.learning_rate,
        lr_scheduler_type=args.lr_scheduler_type,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        num_train_epochs=args.num_epochs,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        weight_decay=0.01,
        metric_for_best_model="accuracy",
        run_name="complexity-java",
        report_to="wandb",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["valid"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    print("Training...")
    trainer.add_callback(CustomCallback(trainer))
    trainer.train()
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome.")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name,
                                data_args.dataset_config_name)
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        if data_args.test_file is not None:
            data_files["test"] = data_args.test_file

        table = csv.read_csv("./data/train.csv",
                             parse_options=ParseOptions(delimiter="\t"))
        class_label_ = table.column("label").unique()
        class_label = ClassLabel(num_classes=len(class_label_),
                                 names=class_label_.tolist())
        train = main_ner.process_data(data_args.train_file, class_label)
        test = main_ner.process_data(data_args.test_file, class_label)
        val = main_ner.process_data(data_args.validation_file, class_label)
        # table = csv.read_csv(data_args.train_file)
        extension = data_args.train_file.split(".")[-1]
        datasets = load_dataset(extension,
                                data_files=data_files,
                                delimiter="\t",
                                quoting=csv_lib.QUOTE_NONE)
        train_dataset = datasets["train"]
        test_dataset = datasets["test"]
        val_dataset = datasets["validation"]

        table = train_dataset.data
        label = table.column("label")
        class_label_ = label.unique()
        class_label = Sequence(feature=ClassLabel(
            num_classes=len(class_label_), names=class_label_.tolist()))
        train_dataset.features['ner_tags'] = class_label
        # train_ner_list: ChunkedArray = class_label.feature.str2int(train_dataset.data.column('label').to_numpy())
        # train_ner_array = pa.array(train_ner_list)
        # train_data = train_dataset.data.append_column("ner_tags", train_ner_array)
        train_dataset._data = train
        test_dataset.features['ner_tags'] = class_label
        test_dataset._data = test
        val_dataset.features['ner_tags'] = class_label
        # val_ner_list: ChunkedArray = class_label.feature.str2int(val_dataset.data.column('label').to_numpy())
        # val_ner_array = pa.array(val_ner_list)
        # val_data = val_dataset.data.append_column("ner_tags", val_ner_array)
        val_dataset._data = val
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    if training_args.do_train:
        column_names = datasets["train"].column_names
        features = datasets["train"].features
    else:
        column_names = datasets["validation"].column_names
        features = datasets["validation"].features
    text_column_name = "tokens" if "tokens" in column_names else column_names[0]
    label_column_name = (f"{data_args.task_name}_tags"
                         if f"{data_args.task_name}_tags" in column_names else
                         column_names[1])

    # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the
    # unique labels.
    def get_label_list(labels):
        unique_labels = set()
        for label in labels:
            unique_labels = unique_labels | set(label)
        label_list = list(unique_labels)
        label_list.sort()
        return label_list

    seq: Sequence = features[label_column_name]
    # label_list = ["O", "B-GENE", "I-GENE"]
    # label_to_id = {i: i for i in range(len(label_list))}
    if isinstance(seq.feature, ClassLabel):
        label_list = features[label_column_name].feature.names
        # No need to convert the labels since they are already ints.
        label_to_id = {i: i for i in range(len(label_list))}
    else:
        label_list = get_label_list(datasets["train"][label_column_name])
        label_to_id = {l: i for i, l in enumerate(label_list)}
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=True,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    model = AutoModelForTokenClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    # Tokenizer check: this script requires a fast tokenizer.
    if not isinstance(tokenizer, PreTrainedTokenizerFast):
        raise ValueError(
            "This example script only works for models that have a fast tokenizer. Checkout the big table of models "
            "at https://huggingface.co/transformers/index.html#bigtable to find the model types that meet this "
            "requirement")

    # Preprocessing the dataset
    # Padding strategy
    padding = "max_length" if data_args.pad_to_max_length else False

    # Tokenize all texts and align the labels with them.
    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(
            examples[text_column_name],
            padding=padding,
            truncation=True,
            # We use this argument because the texts in our dataset are lists of words (with a label for each word).
            is_split_into_words=True,
        )
        labels = []
        if len(examples) == 3:
            for i, label in enumerate(examples[label_column_name]):
                word_ids = tokenized_inputs.word_ids(batch_index=i)
                previous_word_idx = None
                label_ids = []
                for word_idx in word_ids:
                    # Special tokens have a word id that is None. We set the label to -100 so they are automatically
                    # ignored in the loss function.
                    if word_idx is None:
                        label_ids.append(-100)
                    # We set the label for the first token of each word.
                    elif word_idx != previous_word_idx:
                        label_ids.append(label_to_id[label[word_idx]])
                    # For the other tokens in a word, we set the label to either the current label or -100, depending on
                    # the label_all_tokens flag.
                    else:
                        label_ids.append(label_to_id[label[word_idx]]
                                         if data_args.label_all_tokens else -100)
                    previous_word_idx = word_idx
                labels.append(label_ids)
            tokenized_inputs["labels"] = labels
            return tokenized_inputs
        else:
            print("asdasdsa")

    tokenized_datasets = datasets.map(
        tokenize_and_align_labels,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        load_from_cache_file=not data_args.overwrite_cache,
    )

    # Data collator
    data_collator = DataCollatorForTokenClassification(tokenizer)

    # Metrics
    metric = load_metric("seqeval")

    def compute_metrics(p):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [[
            label_list[p] for (p, l) in zip(prediction, label) if l != -100
        ] for prediction, label in zip(predictions, labels)]
        true_labels = [[
            label_list[l] for (p, l) in zip(prediction, label) if l != -100
        ] for prediction, label in zip(predictions, labels)]

        results = metric.compute(predictions=true_predictions,
                                 references=true_labels)
        if data_args.return_entity_level_metrics:
            # Unpack nested dictionaries
            final_results = {}
            for key, value in results.items():
                if isinstance(value, dict):
                    for n, v in value.items():
                        final_results[f"{key}_{n}"] = v
                else:
                    final_results[key] = value
            return final_results
        else:
            return {
                "precision": results["overall_precision"],
                "recall": results["overall_recall"],
                "f1": results["overall_f1"],
                "accuracy": results["overall_accuracy"],
            }

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"]
        if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"]
        if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        train_result = trainer.train(
            model_path=model_args.model_name_or_path
            if os.path.isdir(model_args.model_name_or_path) else None)
        trainer.save_model()  # Saves the tokenizer too for easy upload

        output_train_file = os.path.join(training_args.output_dir,
                                         "train_results.txt")
        if trainer.is_world_process_zero():
            with open(output_train_file, "w") as writer:
                logger.info("***** Train results *****")
                for key, value in sorted(train_result.metrics.items()):
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

            # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
            trainer.state.save_to_json(
                os.path.join(training_args.output_dir, "trainer_state.json"))

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        results = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results_ner.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    # Predict
    if training_args.do_predict:
        logger.info("*** Predict ***")

        test_dataset = tokenized_datasets["test"]
        predictions, labels, metrics = trainer.predict(test_dataset)
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [[
            label_list[p] for (p, l) in zip(prediction, label) if l != -100
        ] for prediction, label in zip(predictions, labels)]

        output_test_results_file = os.path.join(training_args.output_dir,
                                                "test_results.txt")
        if trainer.is_world_process_zero():
            with open(output_test_results_file, "w") as writer:
                for key, value in sorted(metrics.items()):
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

        # Save predictions
        output_test_predictions_file = os.path.join(training_args.output_dir,
                                                    "test_predictions.txt")
        if trainer.is_world_process_zero():
            with open(output_test_predictions_file, "w") as writer:
                for prediction in true_predictions:
                    writer.write(" ".join(prediction) + "\n")

    return results
def load_datasets(train_path=None, test_path=None, add_body=False, limit=None,
                  preprocess=True):
    """
    Load and return datasets

    Returns
    -------
    train_dataset, dev_dataset, test_dataset: datasets.Dataset
    """
    test_path = test_path or _test_path
    train_path = train_path or _train_path

    with open(train_path) as f:
        train_articles = json.load(f)

    with open(test_path) as f:
        test_articles = json.load(f)

    train_comments = [
        serialize(article, comment, add_body) for article in train_articles
        for comment in article["comments"]
    ]
    test_comments = [
        serialize(article, comment, add_body) for article in test_articles
        for comment in article["comments"]
    ]

    if limit:
        train_comments = train_comments[:limit]
        test_comments = test_comments[:limit]

    train_df = pd.DataFrame(train_comments)
    test_df = pd.DataFrame(test_comments)
    train_df, dev_df = train_test_split(train_df, test_size=0.2,
                                        random_state=20212021)

    """
    Apply preprocessing: convert usernames to "usuario" and urls to URL
    """
    if preprocess:
        from pandarallel import pandarallel
        pandarallel.initialize()
        for df in [train_df, dev_df, test_df]:
            df["text"] = df["original_text"].parallel_apply(preprocess_tweet)
            df["article_text"] = df["article_text"].parallel_apply(preprocess_tweet)

    features = Features({
        'id': Value('uint64'),
        'title': Value('string'),
        'text': Value('string'),
        'article_text': Value('string'),
        'HATEFUL': ClassLabel(num_classes=2, names=["Not Hateful", "Hateful"])
    })

    if add_body:
        features["body"] = Value('string')

    for cat in extended_hate_categories:
        """
        Set for WOMEN, LGBTI...and also for CALLS
        """
        features[cat] = ClassLabel(num_classes=2, names=["NO", "YES"])

    columns = list(features.keys())

    train_dataset = Dataset.from_pandas(train_df[columns], features=features)
    dev_dataset = Dataset.from_pandas(dev_df[columns], features=features)
    test_dataset = Dataset.from_pandas(test_df[columns], features=features)

    return train_dataset, dev_dataset, test_dataset